def test_different_hit_data_types(self):
    # Hit structures with the standard field names but varying data types and
    # numbers of fields. Numba recompiles automatically and the clustering
    # result must not change.
    hit_data_types = [
        [('event_number', '<i8'), ('frame', '<u1'), ('column', '<u4'), ('row', '<u4'), ('charge', '<u1'), ('parameter', '<i4')],
        [('event_number', '<i4'), ('frame', '<u8'), ('column', '<u2'), ('row', '<i2'), ('charge', '<u1'), ('parameter', '<i4'), ('parameter2', 'f4')],
    ]

    clusterizer = HitClusterizer(pure_python=self.pure_python)

    # The expected cluster output is the same for every hit dtype
    expected_cluster_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'), ('ID', '<u2'), ('n_hits', '<u2'), ('charge', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4')]))
    expected_cluster_result['event_number'] = [0, 1, 2, 3]
    expected_cluster_result['n_hits'] = [3, 3, 3, 1]
    expected_cluster_result['charge'] = [1, 2, 1, 1]
    expected_cluster_result['seed_column'] = [2, 4, 8, 10]
    expected_cluster_result['seed_row'] = [3, 7, 15, 19]
    expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
    expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]

    for hit_data_type in hit_data_types:
        clusterizer.set_hit_dtype(np.dtype(hit_data_type))
        # Fake data using the actual hit data structure
        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2, hit_dtype=np.dtype(hit_data_type))
        # The clustered-hit dtype differs per input dtype: it carries the
        # additional cluster info fields appended to the hit fields.
        hit_data_type.extend([('cluster_ID', '<i2'), ('is_seed', '<u1'), ('cluster_size', '<u2'), ('n_cluster', '<u2')])
        expected_hit_result = np.zeros(shape=(10, ), dtype=hit_data_type)
        for field in ('event_number', 'frame', 'column', 'row', 'charge'):
            expected_hit_result[field] = hits[field]
        expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
        expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
        expected_hit_result['n_cluster'] = 1

        hits_clustered, clusters = clusterizer.cluster_hits(hits)

        self.assertTrue((clusters == expected_cluster_result).all())
        self.assertTrue((hits_clustered == expected_hit_result).all())
def test_custom_cluster_fields(self):
    # Cluster structure with custom field names but standard data types
    cluster_dtype = np.dtype([('eventNumber', '<i8'), ('ID', '<u2'), ('size', '<u2'), ('tot', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4')])
    # Mapping from the custom names to the standard cluster field names
    cluster_fields = {'eventNumber': 'event_number',
                      'ID': 'ID',
                      'size': 'n_hits',
                      'tot': 'charge',
                      'seed_column': 'seed_column',
                      'seed_row': 'seed_row',
                      'mean_column': 'mean_column',
                      'mean_row': 'mean_row'}

    # Cluster test hits with the self-defined data type names
    clusterizer = HitClusterizer(cluster_fields=cluster_fields, cluster_dtype=cluster_dtype, pure_python=self.pure_python)
    hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)
    hits_clustered, clusters = clusterizer.cluster_hits(hits)

    # Expected cluster output, expressed with the custom field names
    expected_cluster_result = np.zeros(shape=(4, ), dtype=cluster_dtype)
    expected_cluster_result['eventNumber'] = [0, 1, 2, 3]
    expected_cluster_result['size'] = [3, 3, 3, 1]
    expected_cluster_result['tot'] = [1, 2, 1, 1]
    expected_cluster_result['seed_column'] = [2, 4, 8, 10]
    expected_cluster_result['seed_row'] = [3, 7, 15, 19]
    expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
    expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]

    # Expected clustered-hit output; the hit side keeps the standard dtype
    expected_hit_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2'), ('cluster_ID', '<i2'), ('is_seed', '<u1'), ('cluster_size', '<u2'), ('n_cluster', '<u2')]))
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_hit_result[field] = hits[field]
    expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
    expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
    expected_hit_result['n_cluster'] = 1

    self.assertTrue((clusters == expected_cluster_result).all())
    self.assertTrue((hits_clustered == expected_hit_result).all())
def test_adding_cluster_field(self):
    clusterizer = HitClusterizer(pure_python=self.pure_python)
    hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

    # Expected cluster output including the additional field
    expected_cluster_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'), ('ID', '<u2'), ('n_hits', '<u2'), ('charge', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4'), ('extra_field', 'f4')]))
    expected_cluster_result['event_number'] = [0, 1, 2, 3]
    expected_cluster_result['n_hits'] = [3, 3, 3, 1]
    expected_cluster_result['charge'] = [1, 2, 1, 1]
    expected_cluster_result['seed_column'] = [2, 4, 8, 10]
    expected_cluster_result['seed_row'] = [3, 7, 15, 19]
    expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
    expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]
    expected_cluster_result['extra_field'] = [0., 0., 0., 0.]

    # Expected clustered-hit output; not affected by the extra cluster field
    expected_hit_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2'), ('cluster_ID', '<i2'), ('is_seed', '<u1'), ('cluster_size', '<u2'), ('n_cluster', '<u2')]))
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_hit_result[field] = hits[field]
    expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
    expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
    expected_hit_result['n_cluster'] = 1

    clusterizer.add_cluster_field(description=('extra_field', 'f4'))
    hits_clustered, clusters = clusterizer.cluster_hits(hits)

    self.assertTrue((clusters == expected_cluster_result).all())
    self.assertTrue((hits_clustered == expected_hit_result).all())
def test_set_end_of_cluster_function(self):
    clusterizer = HitClusterizer(pure_python=self.pure_python)
    hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

    # Expected cluster output including the seed_charge field the callback fills
    expected_cluster_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'), ('ID', '<u2'), ('n_hits', '<u2'), ('charge', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4'), ('seed_charge', 'f4')]))
    expected_cluster_result['event_number'] = [0, 1, 2, 3]
    expected_cluster_result['n_hits'] = [3, 3, 3, 1]
    expected_cluster_result['charge'] = [1, 2, 1, 1]
    expected_cluster_result['seed_column'] = [2, 4, 8, 10]
    expected_cluster_result['seed_row'] = [3, 7, 15, 19]
    expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
    expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]
    expected_cluster_result['seed_charge'] = [1., 1., 1., 1.]

    # Expected clustered-hit output
    expected_hit_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2'), ('cluster_ID', '<i2'), ('is_seed', '<u1'), ('cluster_size', '<u2'), ('n_cluster', '<u2')]))
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_hit_result[field] = hits[field]
    expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
    expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
    expected_hit_result['n_cluster'] = 1

    # Additional field to hold the result of the end_of_cluster_function
    # calculation (here: seed charge)
    clusterizer.add_cluster_field(description=('seed_charge', 'f4'))

    # The end-of-loop function has to define all of the following arguments,
    # even when they are not used, and has to be compilable by numba in
    # non-python mode. This one sets the additional seed_charge field.
    def end_of_cluster_function(hits, cluster, is_seed, n_cluster, cluster_size, cluster_id, actual_cluster_index, actual_event_hit_index, actual_cluster_hit_indices, seed_index):
        cluster[actual_cluster_index]['seed_charge'] = hits[seed_index]['charge']

    clusterizer.set_end_of_cluster_function(end_of_cluster_function)  # Install the new end_of_cluster_function

    hits_clustered, cluster = clusterizer.cluster_hits(hits)

    self.assertTrue((cluster == expected_cluster_result).all())
    self.assertTrue((hits_clustered == expected_hit_result).all())
def test_chunked_clustering(self):
    # Big tables have to be chunked and analyzed with repeated
    # clusterizer.cluster_hits(hits_chunk) calls
    clusterizer = HitClusterizer(pure_python=self.pure_python)
    hits = create_hits(n_hits=100, max_column=100, max_row=100, max_frame=1, max_charge=2)

    # Reference result: cluster everything at once. The returned arrays are
    # references that the next cluster_hits call overwrites, hence the copies.
    hits_clustered, cluster = clusterizer.cluster_hits(hits)
    hits_clustered, cluster = hits_clustered.copy(), cluster.copy()

    chunk_size = 6  # Chunk size has to be chosen so events are not split between chunks!
    hits_clustered_chunked, cluster_chunked = None, None
    for start in range(0, 100, chunk_size):  # Cluster in chunks
        hits_clustered_chunk, cluster_chunk = clusterizer.cluster_hits(hits[start:start + chunk_size])
        if hits_clustered_chunked is None:
            hits_clustered_chunked = hits_clustered_chunk.copy()
        else:
            hits_clustered_chunked = np.append(hits_clustered_chunked, hits_clustered_chunk)
        if cluster_chunked is None:
            cluster_chunked = cluster_chunk.copy()
        else:
            cluster_chunked = np.append(cluster_chunked, cluster_chunk)

    # Chunked and all-at-once results must agree
    self.assertTrue((hits_clustered == hits_clustered_chunked).all())
    self.assertTrue((cluster == cluster_chunked).all())
def test_exceptions(self):
    # TEST 1: Adding more hits per cluster than supported raises OutOfRangeError
    hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)
    clusterizer = HitClusterizer(pure_python=self.pure_python)
    clusterizer.set_max_cluster_hits(1)
    with self.assertRaises(OutOfRangeError):
        clusterizer.cluster_hits(hits)

    # TEST 2: A correct custom mapping must not raise
    hit_mapping = {'event_number': 'event_number',
                   'column': 'column',
                   'row': 'row',
                   'charge': 'charge',
                   'frame': 'frame'}
    hit_dtype = np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2')])
    clusterizer = HitClusterizer(hit_fields=hit_mapping, hit_dtype=hit_dtype, pure_python=self.pure_python)

    # TEST 3: A custom clustered-hit struct that does not match the mapping
    # raises a ValueError
    hit_dtype_new = np.dtype([('not_defined', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2')])
    with self.assertRaises(ValueError):
        clusterizer = HitClusterizer(hit_fields=hit_mapping, hit_dtype=hit_dtype_new, pure_python=self.pure_python)

    # TEST 4: A correct mapping for the renamed field, no exception expected
    hit_mapping = {'not_defined': 'event_number',
                   'column': 'column',
                   'row': 'row',
                   'charge': 'charge',
                   'frame': 'frame'}
    clusterizer = HitClusterizer(hit_fields=hit_mapping, hit_dtype=hit_dtype_new, pure_python=self.pure_python)
def cluster_hits(input_hits_file, output_cluster_file=None, max_x_distance=3, max_y_distance=3, max_time_distance=2, dut_name=None, plot=True, max_cluster_hits=1000, max_hit_charge=13, chunk_size=1000000):
    '''Clusters the hits in the data file containing the hit table.

    Parameters
    ----------
    input_hits_file : string
        Filename of the input pytables file with the hit table (Hits node).
    output_cluster_file : string
        Filename of the output cluster file. If None, it is derived from
        the input hits file.
    max_x_distance : uint
        Maximum cluster distance in columns.
    max_y_distance : uint
        Maximum cluster distance in rows.
    max_time_distance : uint
        Maximum cluster distance in time frames.
    dut_name : string
        Name of the DUT, used for the cluster size plot.
    plot : bool
        If True, create a cluster size plot.
    max_cluster_hits : uint
        Maximum number of hits per cluster.
    max_hit_charge : uint
        Hits with charge above this limit are ignored.
    chunk_size : int
        Chunk size of the data when reading from file.

    Returns
    -------
    output_cluster_file : string
        Filename of the created cluster file.
    '''
    logging.info('=== Cluster hits in %s ===', input_hits_file)

    if not output_cluster_file:
        output_cluster_file = os.path.splitext(
            input_hits_file)[0] + '_cluster.h5'

    with tb.open_file(input_hits_file, 'r') as input_file_h5:
        with tb.open_file(output_cluster_file, 'w') as output_file_h5:
            # Create clusterizer object
            clusterizer = HitClusterizer()
            clusterizer.set_max_hits(chunk_size)
            clusterizer.set_max_cluster_hits(max_cluster_hits)
            clusterizer.set_max_hit_charge(max_hit_charge)
            # Set clusterizer settings
            clusterizer.create_cluster_hit_info_array(
                False)  # do not create cluster infos for hits
            clusterizer.set_x_cluster_distance(
                max_x_distance)  # cluster distance in columns
            clusterizer.set_y_cluster_distance(
                max_y_distance)  # cluster distance in rows
            clusterizer.set_frame_cluster_distance(
                max_time_distance)  # cluster distance in time frames

            # Output data
            cluster_table_description = np.dtype([('event_number', '<i8'), ('ID', '<u2'), ('n_hits', '<u2'), ('charge', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4')])
            cluster_table_out = output_file_h5.create_table(
                output_file_h5.root,
                name='Cluster',
                description=cluster_table_description,
                title='Clustered hits',
                filters=tb.Filters(complib='blosc', complevel=5, fletcher32=False))

            for hits, _ in analysis_utils.data_aligned_at_events(
                    input_file_h5.root.Hits,
                    chunk_size=chunk_size,
                    try_speedup=False):
                # The clusterizer assumes event numbers monotonically increase
                if not np.all(np.diff(hits['event_number']) >= 0):
                    raise RuntimeError(
                        'The event number does not always increase. The hits cannot be used like this!'
                    )
                __, cluster = clusterizer.cluster_hits(hits)  # Cluster hits
                if not np.all(np.diff(cluster['event_number']) >= 0):
                    raise RuntimeError(
                        'The event number does not always increase. The cluster cannot be used like this!'
                    )
                cluster_table_out.append(cluster)

    if plot:
        plot_cluster_size(input_cluster_file=output_cluster_file,
                          dut_name=dut_name)

    return output_cluster_file
def test_cluster_algorithm(self):
    # Check the clustering with multiple-jumps data
    clusterizer = HitClusterizer(pure_python=self.pure_python)

    # TEST 1: cluster info only
    hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)
    clusterizer.cluster_hits(hits)
    _, clusters = clusterizer.get_hit_cluster(), clusterizer.get_cluster()

    expected_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'), ('ID', '<u2'), ('n_hits', '<u2'), ('charge', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4')]))
    expected_result['event_number'] = [0, 1, 2, 3]
    expected_result['n_hits'] = [3, 3, 3, 1]
    expected_result['charge'] = [1, 2, 1, 1]
    expected_result['seed_column'] = [2, 4, 8, 10]
    expected_result['seed_row'] = [3, 7, 15, 19]
    expected_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
    expected_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]

    self.assertTrue((clusters == expected_result).all())

    # TEST 2: additionally request per-hit cluster info
    clusterizer.create_cluster_hit_info_array(True)
    hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)
    clusterizer.cluster_hits(hits)
    cluster_hits, clusters = clusterizer.get_hit_cluster(), clusterizer.get_cluster()

    expected_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2'), ('cluster_ID', '<i2'), ('is_seed', '<u1'), ('cluster_size', '<u2'), ('n_cluster', '<u2')]))
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_result[field] = hits[field]
    expected_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
    expected_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
    expected_result['n_cluster'] = 1

    # Hit clustering activated, thus this array has to have 10 entries
    self.assertEqual(cluster_hits.shape[0], 10)
    self.assertTrue(np.array_equal(cluster_hits, expected_result))
def test_cluster_cuts(self):
    # dtypes reused by all cases below
    hit_dtype = np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2')])
    cluster_hit_dtype = np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2'), ('cluster_ID', '<i2'), ('is_seed', '<u1'), ('cluster_size', '<u2'), ('n_cluster', '<u2')])
    cluster_dtype = np.dtype([('event_number', '<i8'), ('ID', '<u2'), ('n_hits', '<u2'), ('charge', 'f4'), ('seed_column', '<u2'), ('seed_row', '<u2'), ('mean_column', 'f4'), ('mean_row', 'f4')])

    # Some fake data
    hits = np.ones(shape=(2, ), dtype=hit_dtype)
    hits[0]['column'], hits[0]['row'], hits[0]['charge'], hits[0]['event_number'] = 17, 36, 30, 19
    hits[1]['column'], hits[1]['row'], hits[1]['charge'], hits[1]['event_number'] = 18, 36, 6, 19

    clusterizer = HitClusterizer(pure_python=self.pure_python)
    clusterizer.create_cluster_hit_info_array(True)

    # Case 1: max hit charge cut set so that all hits are accepted
    clusterizer.set_max_hit_charge(30)  # only add hits with charge <= 30
    clusterizer.cluster_hits(hits)

    # Check cluster
    cluster = clusterizer.get_cluster()
    expected_result = np.zeros(shape=(1, ), dtype=cluster_dtype)
    expected_result['event_number'] = [19]
    expected_result['n_hits'] = [2]
    expected_result['charge'] = [36]
    expected_result['seed_column'] = [17]
    expected_result['seed_row'] = [36]
    expected_result['mean_column'] = [17.18420982]
    expected_result['mean_row'] = [36.0]
    self.assertTrue(np.array_equal(cluster, expected_result))

    # Check cluster hit info
    cluster_hits = clusterizer.get_hit_cluster()
    expected_result = np.zeros(shape=(2, ), dtype=cluster_hit_dtype)
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_result[field] = hits[field]
    expected_result['is_seed'] = [1, 0]
    expected_result['cluster_size'] = [2, 2]
    expected_result['n_cluster'] = 1
    self.assertTrue(np.array_equal(cluster_hits, expected_result))

    # Case 2: max hit charge cut omits the charge > 29 hit
    hits['event_number'] = 20
    clusterizer.set_max_hit_charge(29)
    clusterizer.cluster_hits(hits)

    # Check cluster
    cluster = clusterizer.get_cluster()
    expected_result = np.zeros(shape=(1, ), dtype=cluster_dtype)
    expected_result['event_number'] = [20]
    expected_result['n_hits'] = [1]
    expected_result['charge'] = [6]
    expected_result['seed_column'] = [18]
    expected_result['seed_row'] = [36]
    expected_result['mean_column'] = [18.0]
    expected_result['mean_row'] = [36.0]
    self.assertTrue(np.array_equal(cluster, expected_result))

    # Check cluster hit info; the omitted hit gets cluster_ID -1
    cluster_hits = clusterizer.get_hit_cluster()
    expected_result = np.zeros(shape=(2, ), dtype=cluster_hit_dtype)
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_result[field] = hits[field]
    expected_result['cluster_ID'] = [-1, 0]
    expected_result['is_seed'] = [0, 1]
    expected_result['cluster_size'] = [0, 1]
    expected_result['n_cluster'] = [1, 1]
    self.assertTrue(np.array_equal(cluster_hits, expected_result))

    # Case 3: the same hit occurring twice within one event
    hits = np.ones(shape=(3, ), dtype=hit_dtype)
    hits[0]['column'], hits[0]['row'], hits[0]['charge'], hits[0]['event_number'] = 18, 36, 6, 19
    hits[1]['column'], hits[1]['row'], hits[1]['charge'], hits[1]['event_number'] = 18, 36, 6, 19
    hits[2]['column'], hits[2]['row'], hits[2]['charge'], hits[2]['event_number'] = 18, 38, 6, 19

    expected_hit_result = np.zeros(shape=(3, ), dtype=cluster_hit_dtype)
    expected_cluster_result = np.zeros(shape=(1, ), dtype=cluster_dtype)
    for field in ('event_number', 'frame', 'column', 'row', 'charge'):
        expected_hit_result[field] = hits[field]
    expected_hit_result['cluster_ID'] = [0, -2, 0]
    expected_hit_result['is_seed'] = [1, 0, 0]
    expected_hit_result['cluster_size'] = [2, 0, 2]
    expected_hit_result['n_cluster'] = [1, 1, 1]
    expected_cluster_result['event_number'] = [19]
    expected_cluster_result['n_hits'] = [2]
    expected_cluster_result['charge'] = [12]
    expected_cluster_result['seed_column'] = [18]
    expected_cluster_result['seed_row'] = [36]
    expected_cluster_result['mean_column'] = [18.0]
    expected_cluster_result['mean_row'] = [37.0]

    # If a hit occurred twice in an event it is ignored and gets cluster index -2
    clusterizer.ignore_same_hits(True)
    cluster_hits, cluster = clusterizer.cluster_hits(hits)
    self.assertTrue(np.array_equal(cluster_hits, expected_hit_result))
    self.assertTrue(np.array_equal(cluster, expected_cluster_result))

    # If a hit occurred twice in an event it is used as a normal hit
    clusterizer.ignore_same_hits(False)
    cluster_hits, cluster = clusterizer.cluster_hits(hits)
    expected_hit_result['cluster_ID'] = [0, 0, 0]
    expected_hit_result['is_seed'] = [1, 0, 0]
    expected_hit_result['cluster_size'] = [3, 3, 3]
    expected_hit_result['n_cluster'] = [1, 1, 1]
    self.assertTrue(np.array_equal(cluster_hits, expected_hit_result))
def cluster_hits_niko(input_hits_file, output_cluster_file=None, input_disabled_pixel_mask_file=None, input_noisy_pixel_mask_file=None, min_hit_charge=0, max_hit_charge=None, column_cluster_distance=1, row_cluster_distance=1, frame_cluster_distance=1, dut_name=None, plot=True, chunk_size=1000000):
    '''Clusters the hits in the data file containing the hit table.

    Parameters
    ----------
    input_hits_file : string
        Filename of the input hits file.
    output_cluster_file : string
        Filename of the output cluster file. If None, the filename will be
        derived from the input hits file.
    input_disabled_pixel_mask_file : string
        Filename of the input disabled pixel mask file.
    input_noisy_pixel_mask_file : string
        Filename of the input noisy pixel mask file.
    min_hit_charge : uint
        Minimum hit charge. Minimum possible hit charge must be given in
        order to correctly calculate the cluster coordinates.
    max_hit_charge : uint
        Maximum hit charge. Hits with charge above the limit will be ignored.
    column_cluster_distance : uint
        Maximum column distance between hits so that they are assigned to the
        same cluster. Value of 0 effectively disables the clusterizer in
        column direction.
    row_cluster_distance : uint
        Maximum row distance between hits so that they are assigned to the
        same cluster. Value of 0 effectively disables the clusterizer in row
        direction.
    frame_cluster_distance : uint
        Sometimes an event has additional timing information (e.g. bunch
        crossing ID, frame ID). Value of 0 effectively disables the
        clusterization in time.
    dut_name : string
        Name of the DUT. If None, filename of the output cluster file will
        be used.
    plot : bool
        If True, create additional output plots.
    chunk_size : int
        Chunk size of the data when reading from file.

    Returns
    -------
    output_cluster_file : string
        Filename of the created cluster file.
    '''
    logging.info('=== Clustering hits in %s ===', input_hits_file)

    if output_cluster_file is None:
        output_cluster_file = os.path.splitext(
            input_hits_file)[0] + '_clustered.h5'

    # Get noisy and disabled pixels, they are excluded for clusters
    if input_disabled_pixel_mask_file is not None:
        with tb.open_file(input_disabled_pixel_mask_file, 'r') as input_mask_file_h5:
            disabled_pixels = np.dstack(
                np.nonzero(
                    input_mask_file_h5.root.DisabledPixelMask[:]))[0] + 1
    else:
        disabled_pixels = None
    if input_noisy_pixel_mask_file is not None:
        with tb.open_file(input_noisy_pixel_mask_file, 'r') as input_mask_file_h5:
            noisy_pixels = np.dstack(
                np.nonzero(input_mask_file_h5.root.NoisyPixelMask[:]))[0] + 1
    else:
        noisy_pixels = None

    # Prepare clusterizer
    # End-of-cluster function calculating the size in col/row for each
    # cluster; compiled by numba, thus all arguments must be declared
    def calc_cluster_dimensions(hits, clusters, cluster_size,
                                cluster_hit_indices, cluster_index,
                                cluster_id, charge_correction,
                                noisy_pixels, disabled_pixels,
                                seed_hit_index):
        min_col = hits[cluster_hit_indices[0]].column
        max_col = hits[cluster_hit_indices[0]].column
        min_row = hits[cluster_hit_indices[0]].row
        max_row = hits[cluster_hit_indices[0]].row
        for i in cluster_hit_indices[1:]:
            if i < 0:  # Not used
                break
            if hits[i].column < min_col:
                min_col = hits[i].column
            if hits[i].column > max_col:
                max_col = hits[i].column
            if hits[i].row < min_row:
                min_row = hits[i].row
            if hits[i].row > max_row:
                max_row = hits[i].row
        clusters[cluster_index].err_cols = max_col - min_col + 1
        clusters[cluster_index].err_rows = max_row - min_row + 1
        clusters[cluster_index]['trigger_time_stamp'] = hits[seed_hit_index][
            'trigger_time_stamp']

    # Create clusterizer object with parameters
    clz = HitClusterizer(column_cluster_distance=column_cluster_distance,
                         row_cluster_distance=row_cluster_distance,
                         frame_cluster_distance=frame_cluster_distance,
                         min_hit_charge=min_hit_charge,
                         max_hit_charge=max_hit_charge)
    clz.set_hit_fields({'trigger_time_stamp': 'trigger_time_stamp'})
    clz.set_hit_dtype([('trigger_time_stamp', np.uint64)])

    # Additional fields to hold the cluster size in x/y and the time stamp
    clz.add_cluster_field(description=('err_cols', '<f4'))
    clz.add_cluster_field(description=('err_rows', '<f4'))
    clz.add_cluster_field(description=('trigger_time_stamp', '<u8'))
    clz.set_end_of_cluster_function(calc_cluster_dimensions)

    # Run clusterizer on hit table in parallel on all cores
    def cluster_func(hits, clz, calc_cluster_dimensions):
        # EDGE CASE: the reference of an in-time jitted function does not
        # seem to be pickled correctly when transferred to the worker
        # thread. Thus it has to be set here manually. This might be solved
        # in the future.
        clz.set_hit_fields({'trigger_time_stamp': 'trigger_time_stamp'})
        clz.set_hit_dtype([('trigger_time_stamp', np.uint64)])
        _, cl = clz.cluster_hits(hits,
                                 noisy_pixels=noisy_pixels,
                                 disabled_pixels=disabled_pixels)
        return cl

    smc.SMC(table_file_in=input_hits_file,
            file_out=output_cluster_file,
            func=cluster_func,
            func_kwargs={
                'clz': clz,
                'calc_cluster_dimensions': calc_cluster_dimensions
            },
            node_desc={'name': 'Cluster'},
            align_at='event_number',
            table='Hits',
            chunk_size=chunk_size)

    # Calculate cluster size histogram
    def hist_func(cluster):
        n_hits = cluster['n_hits']
        hist = analysis_utils.hist_1d_index(n_hits,
                                            shape=(np.max(n_hits) + 1, ))
        return hist

    smc.SMC(table_file_in=output_cluster_file,
            file_out=output_cluster_file[:-3] + '_hist.h5',
            func=hist_func,
            node_desc={'name': 'HistClusterSize'},
            chunk_size=chunk_size)

    # Load cluster size infos for error determination and plotting
    with tb.open_file(output_cluster_file[:-3] + '_hist.h5', 'r') as input_file_h5:
        hight = input_file_h5.root.HistClusterSize[:]

    # Calculate position error from cluster size
    def get_eff_pitch(hist, cluster_size):
        ''' Effective pitch to describe the cluster size probability distribution

        hist : array like
            Histogram with cluster size distribution
        cluster_size : uint
            Cluster size to calculate the pitch for
        '''
        # FIX: use the hist parameter (was ignoring it and closing over
        # hight) and the builtin float (np.float was removed in NumPy 1.24)
        return np.sqrt(hist[int(cluster_size)].astype(float) / hist.sum())

    def pos_error_func(clusters):
        # Check if end_of_cluster function was called; under unknown and
        # rare circumstances this might not be the case
        if not np.any(clusters['err_cols']):
            raise RuntimeError(
                'Clustering failed, please report bug at:'
                'https://github.com/SiLab-Bonn/testbeam_analysis/issues')
        # Set errors for small clusters, where charge sharing enhances
        # resolution
        for css in [(1, 1), (1, 2), (2, 1), (2, 2)]:
            sel = np.logical_and(clusters['err_cols'] == css[0],
                                 clusters['err_rows'] == css[1])
            clusters['err_cols'][sel] = get_eff_pitch(
                hist=hight, cluster_size=css[0]) / np.sqrt(12)
            clusters['err_rows'][sel] = get_eff_pitch(
                hist=hight, cluster_size=css[1]) / np.sqrt(12)
        # Set errors for big clusters, where delta electrons reduce resolution
        sel = np.logical_or(clusters['err_cols'] > 2,
                            clusters['err_rows'] > 2)
        clusters['err_cols'][sel] = clusters['err_cols'][sel] / np.sqrt(12)
        clusters['err_rows'][sel] = clusters['err_rows'][sel] / np.sqrt(12)
        return clusters

    smc.SMC(table_file_in=output_cluster_file,
            file_out=output_cluster_file,
            func=pos_error_func,
            chunk_size=chunk_size)

    # Copy masks to result cluster file
    with tb.open_file(output_cluster_file, 'r+') as output_file_h5:
        if input_disabled_pixel_mask_file is not None:
            with tb.open_file(input_disabled_pixel_mask_file, 'r') as input_mask_file_h5:
                input_mask_file_h5.root.DisabledPixelMask._f_copy(
                    newparent=output_file_h5.root)
        if input_noisy_pixel_mask_file is not None:
            with tb.open_file(input_noisy_pixel_mask_file, 'r') as input_mask_file_h5:
                input_mask_file_h5.root.NoisyPixelMask._f_copy(
                    newparent=output_file_h5.root)

    if plot:
        plot_cluster_size(
            output_cluster_file,
            dut_name=os.path.split(output_cluster_file)[1],
            output_pdf_file=os.path.splitext(output_cluster_file)[0] +
            '_cluster_size.pdf',
            chunk_size=chunk_size,
            gui=False)

    return output_cluster_file
if __name__ == "__main__": # A custom hit structure is defined here with unique names and data types hit_dtype = np.dtype([('event_number', '<i8'), ('frame', '<u1'), ('column', '<u2'), ('row', '<u2'), ('charge', '<u2')]) # Create some fake data hits = np.ones(shape=(3, ), dtype=hit_dtype) hits[0]['column'], hits[0]['row'], hits[0]['charge'], hits[0]['event_number'] = 17, 36, 11, 19 hits[1]['column'], hits[1]['row'], hits[1]['charge'], hits[1]['event_number'] = 18, 36, 6, 19 hits[2]['column'], hits[2]['row'], hits[2]['charge'], hits[2]['event_number'] = 7, 7, 1, 19 # Initialize clusterizer object clusterizer = HitClusterizer() clusterizer.add_cluster_field(description=('seed_charge', '<u1')) # Add an additional field to hold the charge of the seed hit # The end of loop function has to define all of the following arguments, even when they are not used # It has to be compile able by numba in non python mode # This end_of_cluster_function sets the additional seed_charge field def end_of_cluster_function(hits, cluster, is_seed, n_cluster, cluster_size, cluster_id, actual_cluster_index, actual_event_hit_index, actual_cluster_hit_indices, seed_index): cluster[actual_cluster_index].seed_charge = hits[seed_index].charge clusterizer.set_end_of_cluster_function(end_of_cluster_function) # Set the new function to the clusterizer # Main function cluster_hits, cluster = clusterizer.cluster_hits(hits) # cluster hits # Print input / output histograms print('INPUT:')
'ID': 'ID', 'seed_column': 'seed_column', 'seed_row': 'seed_row', 'mean_column': 'mean_column', 'mean_row': 'mean_row' } # Create some fake data hits = np.ones(shape=(3, ), dtype=hit_dtype) hits[0]['x'], hits[0]['y'], hits[0]['tot'], hits[0]['timestamp'] = 17, 36, 7, 1.0 hits[1]['x'], hits[1]['y'], hits[1]['tot'], hits[1]['timestamp'] = 18, 36, 6, 1.0 hits[2]['x'], hits[2]['y'], hits[2]['tot'], hits[2]['timestamp'] = 7, 7, 1, 1.1 # Initialize clusterizer object clusterizer = HitClusterizer(hit_fields=hit_fields, hit_dtype=hit_dtype, cluster_fields=cluster_fields, cluster_dtype=cluster_dtype) # Main function cluster_hits, cluster = clusterizer.cluster_hits(hits) # cluster hits # Print input / output histograms print('INPUT:') pprint_array(hits) print('OUTPUT:') print('Hits with cluster info:') pprint_array(cluster_hits) print('Cluster info:') pprint_array(cluster)