def test_different_hit_data_types(self):
        # Define hit data structures with standard field names but different data types and numbers of fields. Numba recompiles automatically and the result should not change
        hit_data_types = []
        hit_data_types.append([('event_number', '<i8'),
                               ('frame', '<u1'),
                               ('column', '<u4'),
                               ('row', '<u4'),
                               ('charge', '<u1'),
                               ('parameter', '<i4')])
        hit_data_types.append([('event_number', '<i4'),
                               ('frame', '<u8'),
                               ('column', '<u2'),
                               ('row', '<i2'),
                               ('charge', '<u1'),
                               ('parameter', '<i4'),
                               ('parameter2', 'f4')])

        # Initialize clusterizer
        clusterizer = HitClusterizer(pure_python=self.pure_python)

        # Define expected output
        expected_cluster_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'),
                                                                        ('ID', '<u2'),
                                                                        ('n_hits', '<u2'),
                                                                        ('charge', 'f4'),
                                                                        ('seed_column', '<u2'),
                                                                        ('seed_row', '<u2'),
                                                                        ('mean_column', 'f4'),
                                                                        ('mean_row', 'f4')]))
        expected_cluster_result['event_number'] = [0, 1, 2, 3]
        expected_cluster_result['n_hits'] = [3, 3, 3, 1]
        expected_cluster_result['charge'] = [1, 2, 1, 1]
        expected_cluster_result['seed_column'] = [2, 4, 8, 10]
        expected_cluster_result['seed_row'] = [3, 7, 15, 19]
        expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
        expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]

        for hit_data_type in hit_data_types:
            clusterizer.set_hit_dtype(np.dtype(hit_data_type))
            # Create fake data with actual hit data structure
            hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2, hit_dtype=np.dtype(hit_data_type))
            # Define expected output. The clustered hit data types differ, so the expected results need matching data types
            hit_data_type.extend([('cluster_ID', '<i2'),
                                  ('is_seed', '<u1'),
                                  ('cluster_size', '<u2'),
                                  ('n_cluster', '<u2')])
            expected_hit_result = np.zeros(shape=(10, ), dtype=hit_data_type)
            expected_hit_result['event_number'] = hits['event_number']
            expected_hit_result['frame'] = hits['frame']
            expected_hit_result['column'] = hits['column']
            expected_hit_result['row'] = hits['row']
            expected_hit_result['charge'] = hits['charge']
            expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
            expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
            expected_hit_result['n_cluster'] = 1

            hits_clustered, clusters = clusterizer.cluster_hits(hits)  # Cluster hits
            # Test results
            self.assertTrue((clusters == expected_cluster_result).all())
            self.assertTrue((hits_clustered == expected_hit_result).all())

    def test_custom_cluster_fields(self):
        # Define a different cluster data structure with different names but standard data types.
        cluster_dtype = np.dtype([('eventNumber', '<i8'),
                                  ('ID', '<u2'),
                                  ('size', '<u2'),
                                  ('tot', 'f4'),
                                  ('seed_column', '<u2'),
                                  ('seed_row', '<u2'),
                                  ('mean_column', 'f4'),
                                  ('mean_row', 'f4')])

        cluster_fields = {'eventNumber': 'event_number',
                          'ID': 'ID',
                          'size': 'n_hits',
                          'tot': 'charge',
                          'seed_column': 'seed_column',
                          'seed_row': 'seed_row',
                          'mean_column': 'mean_column',
                          'mean_row': 'mean_row'
                          }

        # Initialize clusterizer and cluster test hits with self defined data type names
        clusterizer = HitClusterizer(cluster_fields=cluster_fields, cluster_dtype=cluster_dtype, pure_python=self.pure_python)
        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)
        hits_clustered, clusters = clusterizer.cluster_hits(hits)

        # Define expected output
        expected_cluster_result = np.zeros(shape=(4, ), dtype=cluster_dtype)
        expected_cluster_result['eventNumber'] = [0, 1, 2, 3]
        expected_cluster_result['size'] = [3, 3, 3, 1]
        expected_cluster_result['tot'] = [1, 2, 1, 1]
        expected_cluster_result['seed_column'] = [2, 4, 8, 10]
        expected_cluster_result['seed_row'] = [3, 7, 15, 19]
        expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
        expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]

        # Define expected hit output. The clustered hits have additional cluster info fields and thus a different data type
        expected_hit_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'),
                                                                     ('frame', '<u1'),
                                                                     ('column', '<u2'),
                                                                     ('row', '<u2'),
                                                                     ('charge', '<u2'),
                                                                     ('cluster_ID', '<i2'),
                                                                     ('is_seed', '<u1'),
                                                                     ('cluster_size', '<u2'),
                                                                     ('n_cluster', '<u2')]))
        expected_hit_result['event_number'] = hits['event_number']
        expected_hit_result['frame'] = hits['frame']
        expected_hit_result['column'] = hits['column']
        expected_hit_result['row'] = hits['row']
        expected_hit_result['charge'] = hits['charge']
        expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
        expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
        expected_hit_result['n_cluster'] = 1

        self.assertTrue((clusters == expected_cluster_result).all())
        self.assertTrue((hits_clustered == expected_hit_result).all())

    def test_adding_cluster_field(self):
        clusterizer = HitClusterizer(pure_python=self.pure_python)

        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

        # Define expected cluster output with extra field
        expected_cluster_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'),
                                                                        ('ID', '<u2'),
                                                                        ('n_hits', '<u2'),
                                                                        ('charge', 'f4'),
                                                                        ('seed_column', '<u2'),
                                                                        ('seed_row', '<u2'),
                                                                        ('mean_column', 'f4'),
                                                                        ('mean_row', 'f4'),
                                                                        ('extra_field', 'f4')]))
        expected_cluster_result['event_number'] = [0, 1, 2, 3]
        expected_cluster_result['n_hits'] = [3, 3, 3, 1]
        expected_cluster_result['charge'] = [1, 2, 1, 1]
        expected_cluster_result['seed_column'] = [2, 4, 8, 10]
        expected_cluster_result['seed_row'] = [3, 7, 15, 19]
        expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
        expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]
        expected_cluster_result['extra_field'] = [0., 0., 0., 0.]

        # Define expected hit clustered output
        expected_hit_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'),
                                                                     ('frame', '<u1'),
                                                                     ('column', '<u2'),
                                                                     ('row', '<u2'),
                                                                     ('charge', '<u2'),
                                                                     ('cluster_ID', '<i2'),
                                                                     ('is_seed', '<u1'),
                                                                     ('cluster_size', '<u2'),
                                                                     ('n_cluster', '<u2')]))
        expected_hit_result['event_number'] = hits['event_number']
        expected_hit_result['frame'] = hits['frame']
        expected_hit_result['column'] = hits['column']
        expected_hit_result['row'] = hits['row']
        expected_hit_result['charge'] = hits['charge']
        expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
        expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
        expected_hit_result['n_cluster'] = 1

        clusterizer.add_cluster_field(description=('extra_field', 'f4'))
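        # Without a custom end_of_cluster_function the added field keeps its
        # zero-initialized default, which is why expected_cluster_result['extra_field']
        # is all zeros above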
        hits_clustered, clusters = clusterizer.cluster_hits(hits)

        self.assertTrue((clusters == expected_cluster_result).all())
        self.assertTrue((hits_clustered == expected_hit_result).all())

    def test_set_end_of_cluster_function(self):
        # Initialize clusterizer object
        clusterizer = HitClusterizer(pure_python=self.pure_python)

        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

        # Define expected output
        expected_cluster_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'),
                                                                        ('ID', '<u2'),
                                                                        ('n_hits', '<u2'),
                                                                        ('charge', 'f4'),
                                                                        ('seed_column', '<u2'),
                                                                        ('seed_row', '<u2'),
                                                                        ('mean_column', 'f4'),
                                                                        ('mean_row', 'f4'),
                                                                        ('seed_charge', 'f4')]))
        expected_cluster_result['event_number'] = [0, 1, 2, 3]
        expected_cluster_result['n_hits'] = [3, 3, 3, 1]
        expected_cluster_result['charge'] = [1, 2, 1, 1]
        expected_cluster_result['seed_column'] = [2, 4, 8, 10]
        expected_cluster_result['seed_row'] = [3, 7, 15, 19]
        expected_cluster_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
        expected_cluster_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]
        expected_cluster_result['seed_charge'] = [1., 1., 1., 1.]

        # Define expected hit clustered output
        expected_hit_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'),
                                                                     ('frame', '<u1'),
                                                                     ('column', '<u2'),
                                                                     ('row', '<u2'),
                                                                     ('charge', '<u2'),
                                                                     ('cluster_ID', '<i2'),
                                                                     ('is_seed', '<u1'),
                                                                     ('cluster_size', '<u2'),
                                                                     ('n_cluster', '<u2')]))
        expected_hit_result['event_number'] = hits['event_number']
        expected_hit_result['frame'] = hits['frame']
        expected_hit_result['column'] = hits['column']
        expected_hit_result['row'] = hits['row']
        expected_hit_result['charge'] = hits['charge']
        expected_hit_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
        expected_hit_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
        expected_hit_result['n_cluster'] = 1

        clusterizer.add_cluster_field(description=('seed_charge', 'f4'))  # Add an additional field to hold the result of the end_of_cluster_function calculation (here: seed charge)

        # The end_of_cluster_function has to accept all of the following arguments, even if they are not used
        # It has to be compilable by Numba in nopython mode
        # This end_of_cluster_function sets the additional seed_charge field
        def end_of_cluster_function(hits, cluster, is_seed, n_cluster, cluster_size, cluster_id, actual_cluster_index, actual_event_hit_index, actual_cluster_hit_indices, seed_index):
            cluster[actual_cluster_index]['seed_charge'] = hits[seed_index]['charge']

        clusterizer.set_end_of_cluster_function(end_of_cluster_function)  # Set the new end_of_cluster_function

        # Main function
        hits_clustered, cluster = clusterizer.cluster_hits(hits)  # cluster hits

        self.assertTrue((cluster == expected_cluster_result).all())
        self.assertTrue((hits_clustered == expected_hit_result).all())
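
        # Sketch (not part of the original test): the same mechanism can store
        # any per-cluster quantity derived from the hits, e.g. the seed hit's
        # frame, assuming a field was added first with
        # clusterizer.add_cluster_field(description=('seed_frame', '<u1')):
        #
        #     def end_of_cluster_function(hits, cluster, is_seed, n_cluster, cluster_size, cluster_id, actual_cluster_index, actual_event_hit_index, actual_cluster_hit_indices, seed_index):
        #         cluster[actual_cluster_index]['seed_frame'] = hits[seed_index]['frame']
        #
        #     clusterizer.set_end_of_cluster_function(end_of_cluster_function)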

    def test_chunked_clustering(self):  # Big tables have to be chunked and analyzed with clusterizer.cluster_hits(hits_chunk) calls
        clusterizer = HitClusterizer(pure_python=self.pure_python)

        hits = create_hits(n_hits=100, max_column=100, max_row=100, max_frame=1, max_charge=2)

        hits_clustered, cluster = clusterizer.cluster_hits(hits)  # Cluster all at once
        hits_clustered, cluster = hits_clustered.copy(), cluster.copy()  # Be aware that the returned arrays are references; copy them if they are to be kept, since the next clusterizer.cluster_hits call overwrites the data

        hits_clustered_chunked, cluster_chunked = None, None
        chunk_size = 6  # Chunk size has to be chosen to not split events between chunks!
        for i in range(int(100 / chunk_size + 1)):  # Cluster in chunks
            hits_chunk = hits[i * chunk_size:i * chunk_size + chunk_size]
            hits_clustered_chunk, cluster_chunk = clusterizer.cluster_hits(hits_chunk)
            if hits_clustered_chunked is None:
                hits_clustered_chunked = hits_clustered_chunk.copy()
            else:
                hits_clustered_chunked = np.append(hits_clustered_chunked, hits_clustered_chunk)
            if cluster_chunked is None:
                cluster_chunked = cluster_chunk.copy()
            else:
                cluster_chunked = np.append(cluster_chunked, cluster_chunk)

        self.assertTrue((hits_clustered == hits_clustered_chunked).all())
        self.assertTrue((cluster == cluster_chunked).all())
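
        # Sketch (not from the original test): one way to generate chunk
        # boundaries that never split an event, assuming hits are sorted by
        # event_number (as the arrays from create_hits are):
        #
        #     def chunks_aligned_at_events(hits, chunk_size):
        #         start = 0
        #         while start < hits.shape[0]:
        #             stop = min(start + chunk_size, hits.shape[0])
        #             # Grow the chunk until the event number changes
        #             while stop < hits.shape[0] and hits['event_number'][stop] == hits['event_number'][stop - 1]:
        #                 stop += 1
        #             yield hits[start:stop]
        #             start = stop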

    def test_exceptions(self):
        # TEST 1: Check that adding more cluster hits than supported raises an exception
        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

        clusterizer = HitClusterizer(pure_python=self.pure_python)
        clusterizer.set_max_cluster_hits(1)
        with self.assertRaises(OutOfRangeError):
            clusterizer.cluster_hits(hits)
        # TEST 2: Set Custom mapping that is correct and should not throw an exception
        hit_mapping = {'event_number': 'event_number',
                       'column': 'column',
                       'row': 'row',
                       'charge': 'charge',
                       'frame': 'frame'
                       }
        hit_dtype = np.dtype([('event_number', '<i8'),
                              ('frame', '<u1'),
                              ('column', '<u2'),
                              ('row', '<u2'),
                              ('charge', '<u2')])
        clusterizer = HitClusterizer(hit_fields=hit_mapping, hit_dtype=hit_dtype, pure_python=self.pure_python)
        # TEST 3: Set custom clustered hit struct that is incorrect and should throw an exception
        hit_dtype_new = np.dtype([('not_defined', '<i8'),
                                  ('frame', '<u1'),
                                  ('column', '<u2'),
                                  ('row', '<u2'),
                                  ('charge', '<u2')])
        with self.assertRaises(ValueError):
            clusterizer = HitClusterizer(hit_fields=hit_mapping, hit_dtype=hit_dtype_new, pure_python=self.pure_python)
        # TEST 4: Set custom and correct hit mapping, no exception expected
        hit_mapping = {'not_defined': 'event_number',
                       'column': 'column',
                       'row': 'row',
                       'charge': 'charge',
                       'frame': 'frame'
                       }
        clusterizer = HitClusterizer(hit_fields=hit_mapping, hit_dtype=hit_dtype_new, pure_python=self.pure_python)

# Example #7

def cluster_hits(input_hits_file,
                 output_cluster_file=None,
                 max_x_distance=3,
                 max_y_distance=3,
                 max_time_distance=2,
                 dut_name=None,
                 plot=True,
                 max_cluster_hits=1000,
                 max_hit_charge=13,
                 chunk_size=1000000):
    '''Clusters the hits in the data file containing the hit table.

    Parameters
    ----------
    input_hits_file : string
        Filename of the input hits file.
    output_cluster_file : string
        Filename of the output cluster file. If None, the filename is derived from the input hits file.
    max_x_distance, max_y_distance, max_time_distance : uint
        Maximum column / row / frame distance between hits that are assigned to the same cluster.
    plot : bool
        If True, create a cluster size plot.
    '''

    logging.info('=== Cluster hits in %s ===', input_hits_file)

    if not output_cluster_file:
        output_cluster_file = os.path.splitext(
            input_hits_file)[0] + '_cluster.h5'

    with tb.open_file(input_hits_file, 'r') as input_file_h5:
        with tb.open_file(output_cluster_file, 'w') as output_file_h5:
            # create clusterizer object
            clusterizer = HitClusterizer()
            clusterizer.set_max_hits(chunk_size)
            clusterizer.set_max_cluster_hits(max_cluster_hits)
            clusterizer.set_max_hit_charge(max_hit_charge)

            # Set clusterizer settings
            clusterizer.create_cluster_hit_info_array(
                False)  # do not create cluster infos for hits
            clusterizer.set_x_cluster_distance(
                max_x_distance)  # cluster distance in columns
            clusterizer.set_y_cluster_distance(
                max_y_distance)  # cluster distance in rows
            clusterizer.set_frame_cluster_distance(
                max_time_distance)  # cluster distance in time frames

            # Output data
            cluster_table_description = np.dtype([('event_number', '<i8'),
                                                  ('ID', '<u2'),
                                                  ('n_hits', '<u2'),
                                                  ('charge', 'f4'),
                                                  ('seed_column', '<u2'),
                                                  ('seed_row', '<u2'),
                                                  ('mean_column', 'f4'),
                                                  ('mean_row', 'f4')])
            cluster_table_out = output_file_h5.create_table(
                output_file_h5.root,
                name='Cluster',
                description=cluster_table_description,
                title='Clustered hits',
                filters=tb.Filters(complib='blosc',
                                   complevel=5,
                                   fletcher32=False))

            for hits, _ in analysis_utils.data_aligned_at_events(
                    input_file_h5.root.Hits,
                    chunk_size=chunk_size,
                    try_speedup=False):
                if not np.all(np.diff(hits['event_number']) >= 0):
                    raise RuntimeError(
                        'The event number does not always increase. The hits cannot be used like this!'
                    )
                __, cluster = clusterizer.cluster_hits(hits)  # Cluster hits
                if not np.all(np.diff(cluster['event_number']) >= 0):
                    raise RuntimeError(
                        'The event number does not always increase. The cluster cannot be used like this!'
                    )
                cluster_table_out.append(cluster)

    if plot:
        plot_cluster_size(input_cluster_file=output_cluster_file,
                          dut_name=dut_name)

    return output_cluster_file
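
# Minimal usage sketch for cluster_hits() above. The input file name and the
# cluster distances used here are assumptions for illustration, not values
# from the original source.
if __name__ == "__main__":
    clustered_file = cluster_hits(input_hits_file='dut_0_hits.h5',  # assumed file name
                                  max_x_distance=2,
                                  max_y_distance=2,
                                  plot=False)
    print('Clusters written to', clustered_file)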

    def test_cluster_algorithm(self):  # Check with data containing multiple jumps
        # Initialize clusterizer
        clusterizer = HitClusterizer(pure_python=self.pure_python)

        # TEST 1
        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

        clusterizer.cluster_hits(hits)  # cluster hits
        _, clusters = clusterizer.get_hit_cluster(), clusterizer.get_cluster()

        # Define expected output
        expected_result = np.zeros(shape=(4, ), dtype=np.dtype([('event_number', '<i8'),
                                                                ('ID', '<u2'),
                                                                ('n_hits', '<u2'),
                                                                ('charge', 'f4'),
                                                                ('seed_column', '<u2'),
                                                                ('seed_row', '<u2'),
                                                                ('mean_column', 'f4'),
                                                                ('mean_row', 'f4')]))
        expected_result['event_number'] = [0, 1, 2, 3]
        expected_result['n_hits'] = [3, 3, 3, 1]
        expected_result['charge'] = [1, 2, 1, 1]
        expected_result['seed_column'] = [2, 4, 8, 10]
        expected_result['seed_row'] = [3, 7, 15, 19]
        expected_result['mean_column'] = [2.0, 5.0, 8.0, 10.0]
        expected_result['mean_row'] = [3.0, 9.0, 15.0, 19.0]

        # Test results
        self.assertTrue((clusters == expected_result).all())

        # TEST 2
        clusterizer.create_cluster_hit_info_array(True)
        hits = create_hits(n_hits=10, max_column=100, max_row=100, max_frame=1, max_charge=2)

        clusterizer.cluster_hits(hits)  # cluster hits
        cluster_hits, clusters = clusterizer.get_hit_cluster(), clusterizer.get_cluster()

        # Define expected output
        expected_result = np.zeros(shape=(10, ), dtype=np.dtype([('event_number', '<i8'),
                                                                 ('frame', '<u1'),
                                                                 ('column', '<u2'),
                                                                 ('row', '<u2'),
                                                                 ('charge', '<u2'),
                                                                 ('cluster_ID', '<i2'),
                                                                 ('is_seed', '<u1'),
                                                                 ('cluster_size', '<u2'),
                                                                 ('n_cluster', '<u2')]))
        expected_result['event_number'] = hits['event_number']
        expected_result['frame'] = hits['frame']
        expected_result['column'] = hits['column']
        expected_result['row'] = hits['row']
        expected_result['charge'] = hits['charge']
        expected_result['is_seed'] = [0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
        expected_result['cluster_size'] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 1]
        expected_result['n_cluster'] = 1

        # Test results
        self.assertEqual(cluster_hits.shape[0], 10)  # hit cluster info is activated, thus this array has 10 entries
        self.assertTrue(np.array_equal(cluster_hits, expected_result))

    def test_cluster_cuts(self):
        # Create some fake data
        hits = np.ones(shape=(2, ), dtype=np.dtype([('event_number', '<i8'),
                                                    ('frame', '<u1'),
                                                    ('column', '<u2'),
                                                    ('row', '<u2'),
                                                    ('charge', '<u2')]))
        hits[0]['column'], hits[0]['row'], hits[0]['charge'], hits[0]['event_number'] = 17, 36, 30, 19
        hits[1]['column'], hits[1]['row'], hits[1]['charge'], hits[1]['event_number'] = 18, 36, 6, 19

        # Create clusterizer object
        clusterizer = HitClusterizer(pure_python=self.pure_python)
        clusterizer.create_cluster_hit_info_array(True)

        # Case 1: Test max hit charge cut, accept all hits
        clusterizer.set_max_hit_charge(30)  # only add hits with charge <= 30
        clusterizer.cluster_hits(hits)  # cluster hits

        # Check cluster
        cluster = clusterizer.get_cluster()
        expected_result = np.zeros(shape=(1, ), dtype=np.dtype([('event_number', '<i8'),
                                                                ('ID', '<u2'),
                                                                ('n_hits', '<u2'),
                                                                ('charge', 'f4'),
                                                                ('seed_column', '<u2'),
                                                                ('seed_row', '<u2'),
                                                                ('mean_column', 'f4'),
                                                                ('mean_row', 'f4')]))
        expected_result['event_number'] = [19]
        expected_result['n_hits'] = [2]
        expected_result['charge'] = [36]
        expected_result['seed_column'] = [17]
        expected_result['seed_row'] = [36]
        expected_result['mean_column'] = [17.18420982]
        expected_result['mean_row'] = [36.0]

        self.assertTrue(np.array_equal(cluster, expected_result))

        # Check cluster hit info
        cluster_hits = clusterizer.get_hit_cluster()
        expected_result = np.zeros(shape=(2, ), dtype=np.dtype([('event_number', '<i8'),
                                                                ('frame', '<u1'),
                                                                ('column', '<u2'),
                                                                ('row', '<u2'),
                                                                ('charge', '<u2'),
                                                                ('cluster_ID', '<i2'),
                                                                ('is_seed', '<u1'),
                                                                ('cluster_size', '<u2'),
                                                                ('n_cluster', '<u2')]))
        expected_result['event_number'] = hits['event_number']
        expected_result['frame'] = hits['frame']
        expected_result['column'] = hits['column']
        expected_result['row'] = hits['row']
        expected_result['charge'] = hits['charge']
        expected_result['is_seed'] = [1, 0]
        expected_result['cluster_size'] = [2, 2]
        expected_result['n_cluster'] = 1

        self.assertTrue(np.array_equal(cluster_hits, expected_result))

        # Case 2: Test max hit charge cut, omit charge > 29 hits
        hits['event_number'] = 20
        clusterizer.set_max_hit_charge(29)  # only add hits with charge <= 29
        clusterizer.cluster_hits(hits)  # cluster hits
        # Check cluster
        cluster = clusterizer.get_cluster()
        expected_result = np.zeros(shape=(1, ), dtype=np.dtype([('event_number', '<i8'),
                                                                ('ID', '<u2'),
                                                                ('n_hits', '<u2'),
                                                                ('charge', 'f4'),
                                                                ('seed_column', '<u2'),
                                                                ('seed_row', '<u2'),
                                                                ('mean_column', 'f4'),
                                                                ('mean_row', 'f4')]))
        expected_result['event_number'] = [20]
        expected_result['n_hits'] = [1]
        expected_result['charge'] = [6]
        expected_result['seed_column'] = [18]
        expected_result['seed_row'] = [36]
        expected_result['mean_column'] = [18.0]
        expected_result['mean_row'] = [36.0]
        self.assertTrue(np.array_equal(cluster, expected_result))

        # Check cluster hit info
        cluster_hits = clusterizer.get_hit_cluster()
        expected_result = np.zeros(shape=(2, ), dtype=np.dtype([('event_number', '<i8'),
                                                                ('frame', '<u1'),
                                                                ('column', '<u2'),
                                                                ('row', '<u2'),
                                                                ('charge', '<u2'),
                                                                ('cluster_ID', '<i2'),
                                                                ('is_seed', '<u1'),
                                                                ('cluster_size', '<u2'),
                                                                ('n_cluster', '<u2')]))
        expected_result['event_number'] = hits['event_number']
        expected_result['frame'] = hits['frame']
        expected_result['column'] = hits['column']
        expected_result['row'] = hits['row']
        expected_result['charge'] = hits['charge']
        expected_result['cluster_ID'] = [-1, 0]
        expected_result['is_seed'] = [0, 1]
        expected_result['cluster_size'] = [0, 1]
        expected_result['n_cluster'] = [1, 1]

        self.assertTrue(np.array_equal(cluster_hits, expected_result))

        # Case 3: Add the same hit within an event
        # Create some fake data
        hits = np.ones(shape=(3, ), dtype=np.dtype([('event_number', '<i8'),
                                                    ('frame', '<u1'),
                                                    ('column', '<u2'),
                                                    ('row', '<u2'),
                                                    ('charge', '<u2')]))
        hits[0]['column'], hits[0]['row'], hits[0]['charge'], hits[0]['event_number'] = 18, 36, 6, 19
        hits[1]['column'], hits[1]['row'], hits[1]['charge'], hits[1]['event_number'] = 18, 36, 6, 19
        hits[2]['column'], hits[2]['row'], hits[2]['charge'], hits[2]['event_number'] = 18, 38, 6, 19

        expected_hit_result = np.zeros(shape=(3, ), dtype=np.dtype([('event_number', '<i8'),
                                                                    ('frame', '<u1'),
                                                                    ('column', '<u2'),
                                                                    ('row', '<u2'),
                                                                    ('charge', '<u2'),
                                                                    ('cluster_ID', '<i2'),
                                                                    ('is_seed', '<u1'),
                                                                    ('cluster_size', '<u2'),
                                                                    ('n_cluster', '<u2')]))
        expected_cluster_result = np.zeros(shape=(1, ), dtype=np.dtype([('event_number', '<i8'),
                                                                        ('ID', '<u2'),
                                                                        ('n_hits', '<u2'),
                                                                        ('charge', 'f4'),
                                                                        ('seed_column', '<u2'),
                                                                        ('seed_row', '<u2'),
                                                                        ('mean_column', 'f4'),
                                                                        ('mean_row', 'f4')]))
        expected_hit_result['event_number'] = hits['event_number']
        expected_hit_result['frame'] = hits['frame']
        expected_hit_result['column'] = hits['column']
        expected_hit_result['row'] = hits['row']
        expected_hit_result['charge'] = hits['charge']
        expected_hit_result['cluster_ID'] = [0, -2, 0]
        expected_hit_result['is_seed'] = [1, 0, 0]
        expected_hit_result['cluster_size'] = [2, 0, 2]
        expected_hit_result['n_cluster'] = [1, 1, 1]
        expected_cluster_result['event_number'] = [19]
        expected_cluster_result['n_hits'] = [2]
        expected_cluster_result['charge'] = [12]
        expected_cluster_result['seed_column'] = [18]
        expected_cluster_result['seed_row'] = [36]
        expected_cluster_result['mean_column'] = [18.0]
        expected_cluster_result['mean_row'] = [37.0]

        clusterizer.ignore_same_hits(True)  # If a hit occurred twice in an event it is ignored and gets the cluster index -2
        cluster_hits, cluster = clusterizer.cluster_hits(hits)  # Cluster hits

        self.assertTrue(np.array_equal(cluster_hits, expected_hit_result))
        self.assertTrue(np.array_equal(cluster, expected_cluster_result))

        clusterizer.ignore_same_hits(False)  # If a hit occurred twice in an event it is treated as a normal hit
        cluster_hits, cluster = clusterizer.cluster_hits(hits)  # Cluster hits

        expected_hit_result['cluster_ID'] = [0, 0, 0]
        expected_hit_result['is_seed'] = [1, 0, 0]
        expected_hit_result['cluster_size'] = [3, 3, 3]
        expected_hit_result['n_cluster'] = [1, 1, 1]

        self.assertTrue(np.array_equal(cluster_hits, expected_hit_result))

# Example #10

def cluster_hits_niko(input_hits_file,
                      output_cluster_file=None,
                      input_disabled_pixel_mask_file=None,
                      input_noisy_pixel_mask_file=None,
                      min_hit_charge=0,
                      max_hit_charge=None,
                      column_cluster_distance=1,
                      row_cluster_distance=1,
                      frame_cluster_distance=1,
                      dut_name=None,
                      plot=True,
                      chunk_size=1000000):
    '''Clusters the hits in the data file containing the hit table.

    Parameters
    ----------
    input_hits_file : string
        Filename of the input hits file.
    output_cluster_file : string
        Filename of the output cluster file. If None, the filename will be derived from the input hits file.
    input_disabled_pixel_mask_file : string
        Filename of the input disabled pixel mask file.
    input_noisy_pixel_mask_file : string
        Filename of the input noisy pixel mask file.
    min_hit_charge : uint
        Minimum hit charge. The minimum possible hit charge must be given in order to correctly calculate the cluster coordinates.
    max_hit_charge : uint
        Maximum hit charge. Hits with charge above the limit will be ignored.
    column_cluster_distance : uint
        Maximum column distance between hits so that they are assigned to the same cluster. A value of 0 effectively disables clustering in the column direction.
    row_cluster_distance : uint
        Maximum row distance between hits so that they are assigned to the same cluster. A value of 0 effectively disables clustering in the row direction.
    frame_cluster_distance : uint
        Maximum frame (timing) distance between hits so that they are assigned to the same cluster. Sometimes an event has additional timing information (e.g. bunch crossing ID, frame ID). A value of 0 effectively disables clustering in time.
    dut_name : string
        Name of the DUT. If None, filename of the output cluster file will be used.
    plot : bool
        If True, create additional output plots.
    chunk_size : int
        Chunk size of the data when reading from file.
    '''
    logging.info('=== Clustering hits in %s ===', input_hits_file)

    if output_cluster_file is None:
        output_cluster_file = os.path.splitext(
            input_hits_file)[0] + '_clustered.h5'

    # Get noisy and disabled pixels; they are excluded from clusters
    if input_disabled_pixel_mask_file is not None:
        with tb.open_file(input_disabled_pixel_mask_file,
                          'r') as input_mask_file_h5:
            disabled_pixels = np.dstack(
                np.nonzero(
                    input_mask_file_h5.root.DisabledPixelMask[:]))[0] + 1
    else:
        disabled_pixels = None
    if input_noisy_pixel_mask_file is not None:
        with tb.open_file(input_noisy_pixel_mask_file,
                          'r') as input_mask_file_h5:
            noisy_pixels = np.dstack(
                np.nonzero(input_mask_file_h5.root.NoisyPixelMask[:]))[0] + 1
    else:
        noisy_pixels = None

    # Prepare clusterizer

    # Define end of cluster function to
    # calculate the size in col/row for each cluster
    def calc_cluster_dimensions(hits, clusters, cluster_size,
                                cluster_hit_indices, cluster_index, cluster_id,
                                charge_correction, noisy_pixels,
                                disabled_pixels, seed_hit_index):
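        # Scan all hits that belong to this cluster to find the bounding box in
        # column / row; the extent (+ 1) is stored in the additional
        # err_cols / err_rows cluster fields and the seed hit's
        # trigger_time_stamp is copied to the cluster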

        min_col = hits[cluster_hit_indices[0]].column
        max_col = hits[cluster_hit_indices[0]].column
        min_row = hits[cluster_hit_indices[0]].row
        max_row = hits[cluster_hit_indices[0]].row
        for i in cluster_hit_indices[1:]:
            if i < 0:  # Unused indices are set to -1
                break
            if hits[i].column < min_col:
                min_col = hits[i].column
            if hits[i].column > max_col:
                max_col = hits[i].column
            if hits[i].row < min_row:
                min_row = hits[i].row
            if hits[i].row > max_row:
                max_row = hits[i].row
        clusters[cluster_index].err_cols = max_col - min_col + 1
        clusters[cluster_index].err_rows = max_row - min_row + 1
        clusters[cluster_index]['trigger_time_stamp'] = hits[seed_hit_index][
            'trigger_time_stamp']


    #         for i in range(cluster_id + 1):
    #             clusters[cluster_hit_indices[i]]['trigger_time_stamp'] = hits[cluster_index]['trigger_time_stamp']

    # Create clusterizer object with parameters

    clz = HitClusterizer(column_cluster_distance=column_cluster_distance,
                         row_cluster_distance=row_cluster_distance,
                         frame_cluster_distance=frame_cluster_distance,
                         min_hit_charge=min_hit_charge,
                         max_hit_charge=max_hit_charge)

    clz.set_hit_fields({'trigger_time_stamp': 'trigger_time_stamp'})  # 'relative_BCID': 'frame', 'tot': 'charge',
    clz.set_hit_dtype([('trigger_time_stamp', np.uint64)])  # ('column', np.uint16), ('tot', np.uint8),

    # Add additional fields to hold the cluster size in x / y and the trigger time stamp

    clz.add_cluster_field(description=('err_cols', '<f4'))
    clz.add_cluster_field(description=('err_rows', '<f4'))
    clz.add_cluster_field(description=('trigger_time_stamp', '<u8'))
    clz.set_end_of_cluster_function(calc_cluster_dimensions)

    # Run clusterizer on hit table in parallel on all cores
    def cluster_func(hits, clz, calc_cluster_dimensions):
        # Add cluster size calculation
        # EDGE CASE: the reference to a just-in-time compiled (jitted) function
        # does not seem to be pickled correctly when transferred to the worker
        # thread. Thus it has to be set here manually. This might be solved
        # in the future.

        clz.set_hit_fields({'trigger_time_stamp': 'trigger_time_stamp'})  # 'relative_BCID': 'frame', 'charge': 'charge',
        clz.set_hit_dtype([('trigger_time_stamp', np.uint64)])  # ('column', np.uint16), ('charge', np.uint8),

        _, cl = clz.cluster_hits(hits,
                                 noisy_pixels=noisy_pixels,
                                 disabled_pixels=disabled_pixels)
        return cl

    smc.SMC(table_file_in=input_hits_file,
            file_out=output_cluster_file,
            func=cluster_func,
            func_kwargs={
                'clz': clz,
                'calc_cluster_dimensions': calc_cluster_dimensions
            },
            node_desc={'name': 'Cluster'},
            align_at='event_number',
            table='Hits',
            chunk_size=chunk_size)

    # Calculate cluster size histogram
    def hist_func(cluster):
        n_hits = cluster['n_hits']
        hist = analysis_utils.hist_1d_index(n_hits,
                                            shape=(np.max(n_hits) + 1, ))
        return hist

    smc.SMC(table_file_in=output_cluster_file,
            file_out=output_cluster_file[:-3] + '_hist.h5',
            func=hist_func,
            node_desc={'name': 'HistClusterSize'},
            chunk_size=chunk_size)

    # Load cluster size info for error determination and plotting
    with tb.open_file(output_cluster_file[:-3] + '_hist.h5',
                      'r') as input_file_h5:
        hist_cluster_size = input_file_h5.root.HistClusterSize[:]
        n_clusters = hist_cluster_size.sum()
        n_hits = (hist_cluster_size * np.arange(0, hist_cluster_size.shape[0])).sum()
        max_cluster_size = hist_cluster_size.shape[0] - 1

    # Calculate position error from cluster size
    def get_eff_pitch(hist, cluster_size):
        ''' Effective pitch to describe the cluster
            size probability distribution

        hist : array like
            Histogram with cluster size distribution
        cluster_size : Cluster size to calculate the pitch for
        '''
        # The effective pitch is the square root of the fraction of clusters
        # that have the given cluster size
        return np.sqrt(hist[int(cluster_size)].astype(np.float64) / hist.sum())

    def pos_error_func(clusters):
        # Check if end_of_cluster function was called
        # Under unknown and rare circumstances this might not be the case
        if not np.any(clusters['err_cols']):
            raise RuntimeError(
                'Clustering failed, please report bug at: '
                'https://github.com/SiLab-Bonn/testbeam_analysis/issues')
        # Set errors for small clusters, where charge sharing enhances
        # resolution
        for css in [(1, 1), (1, 2), (2, 1), (2, 2)]:
            sel = np.logical_and(clusters['err_cols'] == css[0],
                                 clusters['err_rows'] == css[1])
            clusters['err_cols'][sel] = get_eff_pitch(
                hist=hist_cluster_size, cluster_size=css[0]) / np.sqrt(12)
            clusters['err_rows'][sel] = get_eff_pitch(
                hist=hist_cluster_size, cluster_size=css[1]) / np.sqrt(12)
        # Set errors for big clusters, where delta electrons reduce resolution
        sel = np.logical_or(clusters['err_cols'] > 2, clusters['err_rows'] > 2)
        clusters['err_cols'][sel] = clusters['err_cols'][sel] / np.sqrt(12)
        clusters['err_rows'][sel] = clusters['err_rows'][sel] / np.sqrt(12)

        return clusters

    smc.SMC(table_file_in=output_cluster_file,
            file_out=output_cluster_file,
            func=pos_error_func,
            chunk_size=chunk_size)

    # Copy masks to result cluster file
    with tb.open_file(output_cluster_file, 'r+') as output_file_h5:
        # Copy nodes to result file
        if input_disabled_pixel_mask_file is not None:
            with tb.open_file(input_disabled_pixel_mask_file,
                              'r') as input_mask_file_h5:
                input_mask_file_h5.root.DisabledPixelMask._f_copy(
                    newparent=output_file_h5.root)
        if input_noisy_pixel_mask_file is not None:
            with tb.open_file(input_noisy_pixel_mask_file,
                              'r') as input_mask_file_h5:
                input_mask_file_h5.root.NoisyPixelMask._f_copy(
                    newparent=output_file_h5.root)

    if plot:
        plot_cluster_size(
            output_cluster_file,
            dut_name=os.path.split(output_cluster_file)[1],
            output_pdf_file=os.path.splitext(output_cluster_file)[0] +
            '_cluster_size.pdf',
            chunk_size=chunk_size,
            gui=False)

    return output_cluster_file
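
# pprint_array (used below) is assumed to be a small helper, as in the
# pixel_clusterizer examples, that prints a structured numpy array; a minimal
# sketch could look like this:
def pprint_array(array):
    # Print the field names followed by every record of the structured array
    print('\t'.join(array.dtype.names))
    for record in array:
        print('\t'.join(str(value) for value in record))
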
if __name__ == "__main__":
    # A hit structure with the standard field names and data types is defined here
    hit_dtype = np.dtype([('event_number', '<i8'),
                          ('frame', '<u1'),
                          ('column', '<u2'),
                          ('row', '<u2'),
                          ('charge', '<u2')])

    # Create some fake data
    hits = np.ones(shape=(3, ), dtype=hit_dtype)
    hits[0]['column'], hits[0]['row'], hits[0]['charge'], hits[0]['event_number'] = 17, 36, 11, 19
    hits[1]['column'], hits[1]['row'], hits[1]['charge'], hits[1]['event_number'] = 18, 36, 6, 19
    hits[2]['column'], hits[2]['row'], hits[2]['charge'], hits[2]['event_number'] = 7, 7, 1, 19

    # Initialize clusterizer object
    clusterizer = HitClusterizer()
    clusterizer.add_cluster_field(description=('seed_charge', '<u1'))  # Add an additional field to hold the charge of the seed hit

    # The end_of_cluster_function has to accept all of the following arguments, even if they are not used
    # It has to be compilable by Numba in nopython mode
    # This end_of_cluster_function sets the additional seed_charge field
    def end_of_cluster_function(hits, cluster, is_seed, n_cluster, cluster_size, cluster_id, actual_cluster_index, actual_event_hit_index, actual_cluster_hit_indices, seed_index):
        cluster[actual_cluster_index].seed_charge = hits[seed_index].charge

    clusterizer.set_end_of_cluster_function(end_of_cluster_function)  # Set the new function to the clusterizer

    # Main function
    cluster_hits, cluster = clusterizer.cluster_hits(hits)  # cluster hits

    # Print input / output histograms
    print('INPUT:')
    pprint_array(hits)
    print('OUTPUT:')
    print('Hits with cluster info:')
    pprint_array(cluster_hits)
    print('Cluster info:')
    pprint_array(cluster)

    # Second example: a custom hit structure with non-standard field names
    # (x, y, tot, timestamp) and a custom cluster structure. The hit_dtype,
    # hit_fields and cluster_dtype definitions are assumed to follow the same
    # pattern as in test_custom_cluster_fields above; only the tail of the
    # cluster_fields mapping survives here.
    cluster_fields = {'ID': 'ID',
                      'seed_column': 'seed_column',
                      'seed_row': 'seed_row',
                      'mean_column': 'mean_column',
                      'mean_row': 'mean_row'
                      }

    # Create some fake data
    hits = np.ones(shape=(3, ), dtype=hit_dtype)
    hits[0]['x'], hits[0]['y'], hits[0]['tot'], hits[0]['timestamp'] = 17, 36, 7, 1.0
    hits[1]['x'], hits[1]['y'], hits[1]['tot'], hits[1]['timestamp'] = 18, 36, 6, 1.0
    hits[2]['x'], hits[2]['y'], hits[2]['tot'], hits[2]['timestamp'] = 7, 7, 1, 1.1

    # Initialize clusterizer object
    clusterizer = HitClusterizer(hit_fields=hit_fields,
                                 hit_dtype=hit_dtype,
                                 cluster_fields=cluster_fields,
                                 cluster_dtype=cluster_dtype)

    # Main function
    cluster_hits, cluster = clusterizer.cluster_hits(hits)  # cluster hits

    # Print input / output histograms
    print('INPUT:')
    pprint_array(hits)
    print('OUTPUT:')
    print('Hits with cluster info:')
    pprint_array(cluster_hits)
    print('Cluster info:')
    pprint_array(cluster)