示例#1
0
def main():
    """Load the track database, print dataset statistics and save a train / validation split.

    Opens ``dataset.hdf5`` in ``DATASET_FOLDER``, loads the tracks accepted by ``track_filter``,
    prints track / segment / camera breakdowns, splits the data set by day (re-using the bin split
    stored in ``template.dat`` when ``USE_PREVIOUS_SPLIT`` is set) and pickles the resulting
    datasets to ``datasets.dat`` in ``DATASET_FOLDER``.

    Side effects: rebinds the module-level ``dataset`` and ``db`` globals.  The breakdown and split
    helpers take no dataset argument, so they presumably read these globals.
    """

    global dataset
    global db

    db = TrackDatabase(os.path.join(DATASET_FOLDER, 'dataset.hdf5'))
    dataset = Dataset(db, 'dataset')

    total_tracks = len(db.get_all_track_ids())

    tracks_loaded = dataset.load_tracks(track_filter)

    print("Loaded {}/{} tracks, found {:.1f}k segments".format(
        tracks_loaded, total_tracks,
        len(dataset.segments) / 1000))
    # only report the filters that actually rejected something
    for key, value in filtered_stats.items():
        if value != 0:
            print("  {} filtered {}".format(key, value))
    print()

    # keys are already unique, so sorting them directly is enough
    dataset.labels = sorted(dataset.tracks_by_label.keys())

    show_tracks_breakdown()
    print()
    show_segments_breakdown()
    print()
    show_cameras_breakdown()
    print()

    print("Splitting data set into train / validation")
    if USE_PREVIOUS_SPLIT:
        split = get_bin_split('template.dat')
        datasets = split_dataset_days(split)
    else:
        datasets = split_dataset_days()

    # the context manager guarantees the file is flushed and closed once the dump completes
    with open(os.path.join(DATASET_FOLDER, 'datasets.dat'), 'wb') as datasets_file:
        pickle.dump(datasets, datasets_file)
class CPTVTrackExtractor(CPTVFileProcessor):
    """
    Handles extracting tracks from CPTV files.
    Maintains a database recording which files have already been processed, and some statistics parameters used
    during processing.
    """

    def __init__(self, config, tracker_config):
        """
        Prepares the extractor: compression options, previewer, filter switches, track database and hints.
        :param config: configuration object; `extract.*` and `tracks_folder` are read here
        :param tracker_config: tracker configuration, handed on to CPTVFileProcessor
        """

        CPTVFileProcessor.__init__(self, config, tracker_config)

        # per clip track limits keyed by filename, filled in by load_hints
        self.hints = {}
        self.enable_track_output = True

        if self.config.extract.enable_compression:
            self.compression = blosc_zstd
        else:
            self.compression = None

        self.previewer = Previewer.create_if_required(config, config.extract.preview)

        # when set, the poor quality tracks that are normally filtered out are let through.
        self.disable_track_filters = False
        # when set, background subtraction is turned off
        self.disable_background_subtraction = False

        tracks_folder = self.config.tracks_folder
        os.makedirs(tracks_folder, mode=0o775, exist_ok=True)
        database_path = os.path.join(tracks_folder, "dataset.hdf5")
        self.database = TrackDatabase(database_path)

        # Hints are a way to give extra information to the tracker when necessary.
        hints_file = config.extract.hints_file
        if hints_file:
            self.load_hints(hints_file)

    def load_hints(self, filename):
        """
        Read in the hints file from the given path.

        Each non-blank line that does not start with `#` holds a clip filename followed by the maximum
        number of tracks for that clip; any further fields on the line are ignored.  If the file is not
        found a warning is logged and the hints dictionary is left empty.

        :param filename: path to the hints file
        :raises ValueError: if a line has fewer than two fields or its track count is not an integer.
            The message contains the 1-based line number and the offending line.
        """

        self.hints = {}

        if not os.path.exists(filename):
            logging.warning("Failed to load hints file: %s", filename)
            return

        # the context manager releases the file handle even when a line fails to parse
        with open(filename) as hints_file:
            for line_number, line in enumerate(hints_file, start=1):
                line = line.strip()
                # skip blank lines and comments
                if not line or line.startswith("#"):
                    continue
                try:
                    clip_name, file_max_tracks = line.split()[:2]
                    self.hints[clip_name] = int(file_max_tracks)
                except ValueError as err:
                    raise ValueError(
                        "Error on line {0}: {1}".format(line_number, line)
                    ) from err

    def process_all(self, root):
        """
        Walks `root` and processes every sub folder, at any depth, that is not listed in
        config.excluded_folders.  The lower-cased folder name is used as the tag.

        Folders named "false-positive" are processed with track filters and background subtraction
        disabled; the previous settings are restored afterwards, also when processing fails.

        :param root: folder to walk; config.source_folder is used when None
        """
        if root is None:
            root = self.config.source_folder

        previous_filter_setting = self.disable_track_filters
        previous_background_setting = self.disable_background_subtraction

        for folder_root, folders, _ in os.walk(root):
            for folder in folders:
                if folder in self.config.excluded_folders:
                    continue

                tag = folder.lower()
                is_false_positive = tag == "false-positive"

                if is_false_positive:
                    self.disable_track_filters = True
                    self.disable_background_subtraction = True
                    logging.info("Turning Track filters OFF.")

                try:
                    self.process_folder(os.path.join(folder_root, folder), tag=tag)
                finally:
                    # restore even on failure, otherwise anything processed later would silently
                    # run with the filters still disabled
                    if is_false_positive:
                        logging.info("Restoring Track filters.")
                        self.disable_track_filters = previous_filter_setting
                        self.disable_background_subtraction = (
                            previous_background_setting
                        )

    def clean_tag(self, tag):
        """
        Deletes every clip owning at least one track whose metadata tag equals `tag`.
        :param tag: label whose clips should be removed
        """
        logging.info("removing tag: %s", tag)

        database = self.database
        for clip_id, track_number in database.get_all_track_ids():
            # the id list is fetched up front, so a clip may already be gone by the time we get here
            if database.has_clip(clip_id):
                track_meta = database.get_track_meta(clip_id, track_number)
                if track_meta["tag"] == tag:
                    logging.info("removing: %s", clip_id)
                    database.remove_clip(clip_id)

    def clean_all(self):
        """
        Removes clips from the database that conflict with the hints file: clips whose hint is 0 (banned), and
        clips that were stored with more tracks than their hint allows.
        """

        for clip_id, max_tracks in self.hints.items():
            if not self.database.has_clip(clip_id):
                continue

            if max_tracks == 0:
                logging.info(" - removing banned clip %s", clip_id)
                self.database.remove_clip(clip_id)
                continue

            # the clip metadata records how many tracks were stored for the clip
            clip_meta = self.database.get_clip_meta(clip_id)
            if clip_meta["tracks"] > max_tracks:
                logging.info(" - removing out of date clip: %s", clip_id)
                self.database.remove_clip(clip_id)

    def process_file(self, full_path, **kwargs):
        """
        Extract tracks from specific file, and assign given tag.

        :param full_path: path to CPTV file to be processed
        :param tag: (keyword, required) the tag to assign all tracks from this CPTV files
        :returns: the tracker object, or None when the file is not processed: its hint is 0, it has no
            manual tags, or it has mixed manual tags
        :raises KeyError: if the `tag` keyword argument is missing
        """

        tag = kwargs["tag"]

        base_filename = os.path.splitext(os.path.split(full_path)[1])[0]
        cptv_filename = base_filename + ".cptv"

        logging.info("processing %s", cptv_filename)

        destination_folder = os.path.join(self.config.tracks_folder, tag.lower())
        os.makedirs(destination_folder, mode=0o775, exist_ok=True)
        # delete any previous files
        tools.purge(destination_folder, base_filename + "*.mp4")

        # read additional information from hints file
        if cptv_filename in self.hints:
            max_tracks = self.hints[cptv_filename]
            logging.info("%s hint: max tracks %s", cptv_filename, max_tracks)
            if max_tracks == 0:
                # a hint of 0 marks the clip as banned
                return None
        else:
            max_tracks = self.config.tracking.max_tracks

        # load the track
        tracker = TrackExtractor(self.tracker_config)
        tracker.max_tracks = max_tracks
        tracker.tag = tag

        # by default we don't want to process the moving background images as it's too hard to get good tracks
        # without false-positives.
        tracker.reject_non_static_clips = True

        if self.disable_track_filters:
            tracker.track_min_delta = 0.0
            tracker.track_min_mass = 0.0
            tracker.track_min_offset = 0.0
            tracker.reject_non_static_clips = False

        if self.disable_background_subtraction:
            tracker.disable_background_subtraction = True

        # read metadata
        meta_data_filename = os.path.splitext(full_path)[0] + ".txt"
        if os.path.exists(meta_data_filename):

            meta_data = tools.load_clip_metadata(meta_data_filename)

            # animals from the manual (non automatic) tags only
            tags = {
                clip_tag["animal"]
                for clip_tag in meta_data["Tags"]
                if "automatic" not in clip_tag or not clip_tag["automatic"]
            }

            # we can only handle one tagged animal at a time here.
            if len(tags) == 0:
                logging.warning(" - no tags in cptv files, ignoring.")
                return None

            if len(tags) >= 2:
                # make sure all tags are the same
                logging.warning(" - mixed tags, can not process: %s", tags)
                return None

            # NOTE(review): these are read from the first tag entry, whether or not it is automatic
            first_tag = meta_data["Tags"][0]
            tracker.stats["confidence"] = first_tag.get("confidence", 0.0)
            tracker.stats["trap"] = first_tag.get("trap", "none")
            tracker.stats["event"] = first_tag.get("event", "none")

            # clips tagged with false-positive sometimes come through with a null confidence rating
            # so we set it to 0.8 here.
            if (
                tracker.stats["event"] in ["false-positive", "false positive"]
                and tracker.stats["confidence"] is None
            ):
                tracker.stats["confidence"] = 0.8

            tracker.stats["cptv_metadata"] = meta_data
        else:
            self.log_warning(
                " - Warning: no tag metadata found for file - cannot use for machine learning."
            )

        start = time.time()

        # save some additional stats
        tracker.stats["version"] = TrackExtractor.VERSION

        tracker.load(full_path)

        if not tracker.extract_tracks():
            # this happens if the tracker rejected the video for some reason (i.e. too hot, or not static background).
            # we still need to make a record that we looked at it though.
            self.database.create_clip(os.path.basename(full_path), tracker)
            logging.warning(" - skipped (%s)", tracker.reject_reason)
            return tracker

        # assign each track the correct tag
        for track in tracker.tracks:
            track.tag = tag

        if self.enable_track_output:
            # export_tracks takes (full_path, tracks, tracker, database); leaving `tracks` out shifts
            # every argument by one and raises a TypeError for the missing `database`
            self.export_tracks(full_path, tracker.tracks, tracker, self.database)

        # write a preview
        if self.previewer:
            preview_filename = base_filename + "-preview" + ".mp4"
            preview_filename = os.path.join(destination_folder, preview_filename)
            self.previewer.create_individual_track_previews(preview_filename, tracker)
            self.previewer.export_clip_preview(preview_filename, tracker)

        if self.tracker_config.verbose:
            num_frames = len(tracker.frame_buffer.thermal)
            # max() guards against a division by zero on an empty clip
            ms_per_frame = (time.time() - start) * 1000 / max(1, num_frames)
            self.log_message(
                "Tracks {}.  Frames: {}, Took {:.1f}ms per frame".format(
                    len(tracker.tracks), num_frames, ms_per_frame
                )
            )

        return tracker

    def export_tracks(
        self, full_path, tracks, tracker: TrackExtractor, database: TrackDatabase
    ):
        """
        Stores the clip, and every track the tracker found in it, in a track database.
        :param full_path: path of the source file, its basename is used as the clip id
        :param tracks: not used by this method
        :param tracker: tracker holding the extracted tracks
        :param database: database to write the clip and its tracks to.
        """

        clip_id = os.path.basename(full_path)

        # The clip entry is (re)created even when no tracks were found, so there is always a record that the
        # clip has been processed.
        database.create_clip(clip_id, tracker)

        if len(tracker.tracks) == 0:
            return

        tracker.generate_optical_flow()

        zero_filtered = not self.config.extract.include_filtered_channel

        # track ids are numbered from 1
        for track_id, track in enumerate(tracker.tracks, start=1):
            frames = []
            for frame_number in range(len(track)):
                frame_channels = tracker.get_track_channels(track, frame_number)
                if zero_filtered:
                    # blank out the filtered channel
                    frame_channels[TrackChannels.filtered] = 0
                frames.append(frame_channels)

            first_time, last_time = tracker.start_and_end_time_absolute(track)
            database.add_track(
                clip_id,
                track_id,
                frames,
                track,
                opts=self.compression,
                start_time=first_time,
                end_time=last_time,
            )

    def needs_processing(self, source_filename):
        """
        Tells whether the given source file still has to be processed.
        :param source_filename: path of the source file, its basename is the clip id looked up in the database
        :return: True when config.reprocess is set or the clip is not yet in the database, otherwise False
        """

        clip_id = os.path.basename(source_filename)

        # with reprocess enabled the database is not consulted at all
        return True if self.config.reprocess else not self.database.has_clip(clip_id)

    def run_test(self, source_folder, test: TrackerTestCase):
        """
        Runs a specific test case and logs whether the extracted tracks match the expected ones.

        Track output is switched off (and left off) so tracks are not exported to the database.

        :param source_folder: root folder searched for the test's source file
        :param test: test case giving the source file name and the expected (duration, movement) per track
        """

        def are_similar(value, expected, relative_error=0.2, abs_error=2.0):
            """ Checks if value is similar to expected value. An expected value of 0 will always return true. """
            if expected == 0:
                return True
            difference = abs(value - expected)
            # abs() keeps the relative error meaningful should an expected value ever be negative
            return (difference / abs(expected) <= relative_error) or (
                difference <= abs_error
            )

        # find the file.  We looking in all the tag folder to make life simpler when creating the test file.
        source_file = tools.find_file(source_folder, test.source)

        # make sure we don't write to database
        self.enable_track_output = False

        if source_file is None:
            logging.warning(
                "Could not find %s in root folder %s", test.source, source_folder
            )
            return

        logging.info(source_file)
        tracker = self.process_file(source_file, tag="test")

        # process_file returns None when it decides not to process the clip (e.g. no or mixed tags)
        if tracker is None:
            logging.error("%s was not processed, no tracks to compare", test.source)
            return

        # read in stats files and see how we did
        if len(tracker.tracks) != len(test.tracks):
            logging.error(
                "%s Incorrect number of tracks, expected %s found %s",
                test.source,
                len(test.tracks),
                len(tracker.tracks),
            )
            return

        for test_result, (expected_duration, expected_movement) in zip(
            tracker.tracks, test.tracks
        ):

            track_stats = test_result.get_stats()

            if not are_similar(
                test_result.duration, expected_duration
            ) or not are_similar(track_stats.max_offset, expected_movement):
                logging.error(
                    "%s Track too dissimilar expected %s but found %s",
                    test.source,
                    (expected_duration, expected_movement),
                    (test_result.duration, track_stats.max_offset),
                )
            else:
                logging.info("%s passed", test.source)

    def run_tests(self, source_folder, tests_file):
        """
        Processes the files listed in the tests file and compares the results to the expected output.

        Tests file format: blank lines and lines starting with `#` are ignored.  A line whose first word is
        `track` (any case) adds an expected track, given as two integers (length and movement), to the most
        recent test case.  Any other line starts a new test case and is taken as its source file name.

        :param source_folder: root folder searched for the source files
        :param tests_file: path to the tests file
        :raises Exception: if a track line appears before any source file line
        :raises ValueError: if a track line does not hold exactly two integers
        """

        # disable hints for tests.  Hints stay a dict so code that iterates them with .items() keeps working
        self.hints = {}

        tests = []
        test = None

        # load in the test data; the context manager closes the file even if a line fails to parse
        with open(tests_file, "r") as test_lines:
            for line in test_lines:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                fields = line.split()
                if fields[0].lower() == "track":
                    if test is None:
                        raise Exception("Can not have track before source file.")
                    expected_length, expected_movement = [int(x) for x in fields[1:]]
                    test.tracks.append((expected_length, expected_movement))
                else:
                    test = TrackerTestCase()
                    test.source = line
                    tests.append(test)

        logging.info("Found %d test cases", len(tests))

        for test in tests:
            self.run_test(source_folder, test)
示例#3
0
class CPTVTrackExtractor(CPTVFileProcessor):
    """
    Handles extracting tracks from CPTV files.
    Maintains a database recording which files have already been processed, and some statistics parameters used
    during processing.
    """

    # version number.  Recorded into stats file when a clip is processed.
    VERSION = 6

    def __init__(self, out_folder):
        """
        Sets up default options and opens the track database stored in the output folder.
        :param out_folder: folder holding `dataset.hdf5`; also used as the base for per-tag output folders
        """

        CPTVFileProcessor.__init__(self)

        self.out_folder = out_folder

        # per clip track limits keyed by filename, filled in by load_hints
        self.hints = {}

        # output / reporting options
        self.verbose = False
        self.colormap = plt.get_cmap('jet')
        self.enable_previews = False
        self.enable_track_output = True
        self.overwrite_mode = CPTVTrackExtractor.OM_NONE

        # when set, the poor quality tracks that are normally filtered out are let through.
        self.disable_track_filters = False
        # when set, background subtraction is turned off
        self.disable_background_subtraction = False

        self.high_quality_optical_flow = False

        database_path = os.path.join(self.out_folder, 'dataset.hdf5')
        self.database = TrackDatabase(database_path)

        self.worker_pool_init = init_workers

    def load_hints(self, filename):
        """
        Read in the hints file from the given path.

        Each non-blank line that does not start with `#` holds a clip filename followed by the maximum
        number of tracks for that clip; any further fields on the line are ignored.  If the file is not
        found the hints dictionary is left empty.

        :param filename: path to the hints file
        :raises ValueError: if a line has fewer than two fields or its track count is not an integer.
            The message contains the 1-based line number and the offending line.
        """

        self.hints = {}

        if not os.path.exists(filename):
            return

        # the context manager releases the file handle even when a line fails to parse
        with open(filename) as hints_file:
            for line_number, line in enumerate(hints_file, start=1):
                line = line.strip()
                # skip blank lines and comments
                if not line or line.startswith('#'):
                    continue
                try:
                    clip_name, file_max_tracks = line.split()[:2]
                    self.hints[clip_name] = int(file_max_tracks)
                except ValueError as err:
                    raise ValueError("Error on line {0}: {1}".format(line_number, line)) from err

    def process_all(self, root):
        """
        Walks `root` and processes every sub folder, at any depth, that is not listed in EXCLUDED_FOLDERS.
        The lower-cased folder name is used as the tag.

        Folders named "false-positive" are processed with track filters and background subtraction
        disabled; the previous settings are restored afterwards, also when processing fails.

        :param root: folder to walk
        """

        previous_filter_setting = self.disable_track_filters
        previous_background_setting = self.disable_background_subtraction

        # a separate name for the walked folder keeps the `root` parameter intact
        for folder_root, folders, _ in os.walk(root):
            for folder in folders:
                if folder in EXCLUDED_FOLDERS:
                    continue

                tag = folder.lower()
                is_false_positive = tag == "false-positive"

                if is_false_positive:
                    self.disable_track_filters = True
                    self.disable_background_subtraction = True
                    print("Turning Track filters OFF.")

                try:
                    self.process_folder(os.path.join(folder_root, folder), tag=tag, worker_pool_args=(trackdatabase.hdf5_lock,))
                finally:
                    # restore even on failure, otherwise anything processed later would silently
                    # run with the filters still disabled
                    if is_false_positive:
                        print("Restoring Track filters.")
                        self.disable_track_filters = previous_filter_setting
                        self.disable_background_subtraction = previous_background_setting



    def clean_tag(self, tag):
        """
        Deletes every clip owning at least one track whose metadata tag equals `tag`.
        :param tag: label whose clips should be removed
        """
        print("removing tag {}".format(tag))

        database = self.database
        for clip_id, track_number in database.get_all_track_ids():
            # the id list is fetched up front, so a clip may already be gone by the time we get here
            if database.has_clip(clip_id):
                track_meta = database.get_track_meta(clip_id, track_number)
                if track_meta['tag'] == tag:
                    print("removing", clip_id)
                    database.remove_clip(clip_id)


    def clean_all(self):
        """
        Removes clips from the database that conflict with the hints file: clips whose hint is 0 (banned), and
        clips that were stored with more tracks than their hint allows.
        """

        for clip_id, max_tracks in self.hints.items():
            if not self.database.has_clip(clip_id):
                continue

            if max_tracks == 0:
                print(" - removing banned clip {}".format(clip_id))
                self.database.remove_clip(clip_id)
                continue

            # the clip metadata records how many tracks were stored for the clip
            clip_meta = self.database.get_clip_meta(clip_id)
            if clip_meta['tracks'] > max_tracks:
                print(" - removing out of date clip {}".format(clip_id))
                self.database.remove_clip(clip_id)


    def process_file(self, full_path, **kwargs):
        """
        Extract tracks from specific file, and assign given tag.

        When previews are enabled (self.enable_previews) MPEG preview files showing the tracking working are
        also written, which can be quite time consuming.

        :param full_path: path to CPTV file to be processed
        :param tag: (keyword, required) the tag to assign all tracks from this CPTV files
        :returns: the tracker object, or None when the file is not processed: its hint is 0, it is already
            in the database, it has no metadata file, it has no manual tags, or it has mixed manual tags
        :raises KeyError: if the `tag` keyword argument is missing
        """

        tag = kwargs['tag']

        base_filename = os.path.splitext(os.path.split(full_path)[1])[0]
        cptv_filename = base_filename + '.cptv'
        preview_filename = base_filename + '-preview' + '.mp4'

        destination_folder = os.path.join(self.out_folder, tag.lower())

        # read additional information from hints file
        if cptv_filename in self.hints:
            max_tracks = self.hints[cptv_filename]
            if max_tracks == 0:
                # a hint of 0 marks the clip as banned
                return None
        else:
            max_tracks = 10

        # make destination folder if required
        if not os.path.exists(destination_folder):
            self.log_message(" Making path " + destination_folder)
            # exist_ok covers another worker creating the folder between the check and the call
            os.makedirs(destination_folder, exist_ok=True)

        # check if we have already processed this file.  Clips are recorded under the basename of the source
        # file (see create_clip below), so that is the name that has to be looked up.
        if not self.needs_processing(full_path):
            return None
        print("Processing {0} [{1}]".format(cptv_filename, tag))

        # delete any previous files
        tools.purge(destination_folder, base_filename + "*.mp4")

        # load the track
        tracker = TrackExtractor()
        tracker.max_tracks = max_tracks
        tracker.tag = tag
        tracker.verbose = self.verbose >= 2
        tracker.high_quality_optical_flow = self.high_quality_optical_flow

        # by default we don't want to process the moving background images as it's too hard to get good tracks
        # without false-positives.
        tracker.reject_non_static_clips = True

        if self.disable_track_filters:
            tracker.track_min_delta = 0.0
            tracker.track_min_mass = 0.0
            tracker.track_min_offset = 0.0
            tracker.reject_non_static_clips = False

        if self.disable_background_subtraction:
            tracker.disable_background_subtraction = True

        # read metadata
        meta_data_filename = os.path.splitext(full_path)[0] + ".txt"
        if os.path.exists(meta_data_filename):

            meta_data = tools.load_clip_metadata(meta_data_filename)

            # animals from the manual (non automatic) tags only
            tags = {
                clip_tag['animal']
                for clip_tag in meta_data['Tags']
                if 'automatic' not in clip_tag or not clip_tag['automatic']
            }

            # we can only handle one tagged animal at a time here.
            if len(tags) == 0:
                print(" - Warning, no tags in cptv files, ignoring.")
                return None

            if len(tags) >= 2:
                # make sure all tags are the same
                print(" - Warning, mixed tags, can not process.", tags)
                return None

            # NOTE(review): these are read from the first tag entry, whether or not it is automatic
            first_tag = meta_data['Tags'][0]
            tracker.stats['confidence'] = first_tag.get('confidence', 0.0)
            tracker.stats['trap'] = first_tag.get('trap', 'none')
            tracker.stats['event'] = first_tag.get('event', 'none')

            # clips tagged with false-positive sometimes come through with a null confidence rating
            # so we set it to 0.8 here.
            if tracker.stats['event'] in ['false-positive', 'false positive'] and tracker.stats['confidence'] is None:
                tracker.stats['confidence'] = 0.8

            tracker.stats['cptv_metadata'] = meta_data
        else:
            self.log_warning(" - Warning: no metadata found for file.")
            return None

        start = time.time()

        # save some additional stats
        tracker.stats['version'] = CPTVTrackExtractor.VERSION

        tracker.load(full_path)

        if not tracker.extract_tracks():
            # this happens if the tracker rejected the video for some reason (i.e. too hot, or not static background).
            # we still need to make a record that we looked at it though.
            self.database.create_clip(os.path.basename(full_path), tracker)
            print(" - skipped ({})".format(tracker.reject_reason))
            return tracker

        # assign each track the correct tag
        for track in tracker.tracks:
            track.tag = tag

        if self.enable_track_output:
            tracker.export_tracks(self.database)

        # write a preview
        if self.enable_previews:
            self.export_mpeg_preview(os.path.join(destination_folder, preview_filename), tracker)

        # max() guards against a division by zero on an empty frame buffer
        time_per_frame = (time.time() - start) / max(1, len(tracker.frame_buffer))

        self.log_message(" -tracks: {} {:.1f}sec - Time per frame: {:.1f}ms".format(
            len(tracker.tracks),
            sum(track.duration for track in tracker.tracks),
            time_per_frame * 1000
        ))

        return tracker

    def needs_processing(self, source_filename):
        """
        Tells whether the given source file still has to be processed.
        :param source_filename: path of the source file, its basename is the clip id looked up in the database
        :return: True when the overwrite mode is OM_ALL or the clip is not yet in the database, otherwise False
        """

        clip_id = os.path.basename(source_filename)

        # when everything is overwritten the database is not consulted at all
        return True if self.overwrite_mode == self.OM_ALL else not self.database.has_clip(clip_id)

    def run_test(self, source_folder, test: TrackerTestCase):
        """
        Runs a specific test case and prints whether the extracted tracks match the expected ones.

        Track output is switched off (and left off) so tracks are not exported to the database.

        :param source_folder: root folder searched for the test's source file
        :param test: test case giving the source file name and the expected (duration, movement) per track
        """

        def are_similar(value, expected, relative_error=0.2, abs_error=2.0):
            """ Checks if value is similar to expected value. An expected value of 0 will always return true. """
            if expected == 0:
                return True
            difference = abs(value - expected)
            # abs() keeps the relative error meaningful should an expected value ever be negative
            return (difference / abs(expected) <= relative_error) or (difference <= abs_error)

        # find the file.  We looking in all the tag folder to make life simpler when creating the test file.
        source_file = tools.find_file(source_folder, test.source)

        # make sure we don't write to database
        self.enable_track_output = False

        if source_file is None:
            print("Could not find {0} in root folder {1}".format(test.source, source_folder))
            return

        print(source_file)
        tracker = self.process_file(source_file, tag='test')

        # process_file returns None when it decides not to process the clip (e.g. missing metadata)
        if tracker is None:
            print("[Fail] {0} was not processed, no tracks to compare".format(test.source))
            return

        # read in stats files and see how we did
        if len(tracker.tracks) != len(test.tracks):
            print("[Fail] {0} Incorrect number of tracks, expected {1} found {2}".format(test.source, len(test.tracks), len(tracker.tracks)))
            return

        for test_result, (expected_duration, expected_movement) in zip(tracker.tracks, test.tracks):

            track_stats = test_result.get_stats()

            if not are_similar(test_result.duration, expected_duration) or not are_similar(track_stats.max_offset, expected_movement):
                print("[Fail] {0} Track too dissimilar expected {1} but found {2}".format(
                    test.source,
                    (expected_duration, expected_movement),
                    (test_result.duration, track_stats.max_offset)))
            else:
                print("[PASS] {0}".format(test.source))

    def export_track_mpeg_previews(self, filename_base, tracker: TrackExtractor):
        """
        Writes one preview MPEG per track, named `<filename_base>-<n>.mp4` with n counting from 1.
        :param filename_base: output path without the track number and extension
        :param tracker: tracker holding the tracks to export
        :return: None
        """

        # resolution of video file.
        # videos look much better scaled up
        FRAME_SIZE = 4*48

        # both sides are rounded down to a multiple of 4
        frame_size = (FRAME_SIZE // 4 * 4, FRAME_SIZE // 4 * 4)

        for track_number, track in enumerate(tracker.tracks, start=1):
            video_frames = []
            for frame_number in range(len(track.bounds_history)):
                # channel 1 is the one rendered through the colormap
                heat = tracker.get_track_channels(track, frame_number)[1]
                scaled = tools.convert_heat_to_img(heat, self.colormap, 0, 350).resize(frame_size, Image.NEAREST)
                video_frames.append(np.asarray(scaled))

            output_name = filename_base + "-" + str(track_number) + ".mp4"
            tools.write_mpeg(output_name, video_frames)

    def export_mpeg_preview(self, filename, tracker: TrackExtractor):
        """
        Writes the per-track preview files, then streams the tracking preview for the clip to `filename`.
        :param filename: path of the preview MPEG; the per-track files use the same path minus its extension
        :param tracker: tracker holding the tracks to preview
        """

        base_path, _ = os.path.splitext(filename)
        self.export_track_mpeg_previews(base_path, tracker)

        streamer = MPEGPreviewStreamer(tracker, self.colormap)
        tools.stream_mpeg(filename, streamer)

    def run_tests(self, source_folder, tests_file):
        """
        Processes the files listed in the tests file and compares the results to the expected output.

        Tests file format: blank lines and lines starting with `#` are ignored.  A line whose first word is
        `track` (any case) adds an expected track, given as two integers (length and movement), to the most
        recent test case.  Any other line starts a new test case and is taken as its source file name.

        :param source_folder: root folder searched for the source files
        :param tests_file: path to the tests file
        :raises Exception: if a track line appears before any source file line
        :raises ValueError: if a track line does not hold exactly two integers
        """

        # disable hints for tests.  Hints stay a dict so code that iterates them with .items() keeps working
        self.hints = {}

        tests = []
        test = None

        # we need to make sure all tests are redone every time.
        self.overwrite_mode = CPTVTrackExtractor.OM_ALL

        # load in the test data; the context manager closes the file even if a line fails to parse
        with open(tests_file, 'r') as test_lines:
            for line in test_lines:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                fields = line.split()
                if fields[0].lower() == 'track':
                    if test is None:
                        raise Exception("Can not have track before source file.")
                    expected_length, expected_movement = [int(x) for x in fields[1:]]
                    test.tracks.append((expected_length, expected_movement))
                else:
                    test = TrackerTestCase()
                    test.source = line
                    tests.append(test)

        print("Found {0} test cases".format(len(tests)))

        for test in tests:
            self.run_test(source_folder, test)
示例#4
0
class Dataset:
    """
    Stores visit, clip, track, and segment information headers in memory, and allows track / segment streaming from
    disk.
    """

    # Number of threads to use for async loading
    WORKER_THREADS = 2

    # If true, uses processes instead of threads.  Threads do not scale as well due to the GIL, however there is no
    # transfer time required per segment.  Processes scale much better but require ~1ms to pickle the segments
    # across processes.
    # In general, if WORKER_THREADS is one set this to False; if it is two or more set it to True.
    PROCESS_BASED = True

    # number of pixels to inset from frame edges by default
    DEFAULT_INSET = 2

    def __init__(
        self,
        db_file,
        name="Dataset",
        config=None,
        use_segments=True,
        use_predictions=False,
        consecutive_segments=False,
        labels=[],
    ):
        self.consecutive_segments = consecutive_segments
        # self.camera_bins = {}
        self.use_segments = use_segments
        # database holding track data
        self.db_file = db_file
        self.db = None
        self.load_db()
        self.label_mapping = None
        # name of this dataset
        self.name = name
        # list of our tracks
        self.tracks = []
        self.tracks_by_label = {}
        self.tracks_by_bin = {}
        self.tracks_by_id = {}
        self.camera_names = set()
        # self.cameras_by_id = {}

        # cumulative distribution function for segments.  Allows for super fast weighted random sampling.
        self.segment_cdf = []
        self.segment_label_cdf = {}
        # segments list
        self.segments = []
        self.segments_by_label = {}
        self.segments_by_id = {}

        self.frame_cdf = []
        self.frame_label_cdf = {}

        self.frame_samples = []
        self.frames_by_label = {}
        self.frames_by_id = {}

        # list of label names
        self.labels = labels
        self.label_mapping = None

        self.enable_augmentation = False
        self.label_caps = {}

        if config:
            self.segment_length = config.build.segment_length
            # number of seconds segments are spaced apart
            self.segment_spacing = config.build.segment_spacing
            self.banned_clips = config.build.banned_clips
            self.included_labels = config.labels
            self.segment_min_avg_mass = config.build.segment_min_avg_mass
        else:
            # number of seconds each segment should be
            self.segment_length = 25
            # number of seconds segments are spaced apart
            self.segment_spacing = 1
            self.segment_min_avg_mass = None
        self.filtered_stats = {
            "confidence": 0,
            "trap": 0,
            "banned": 0,
            "date": 0,
            "tags": 0,
            "segment_mass": 0,
            "no_data": 0,
        }
        self.lbl_p = None
        self.numpy_data = None

    # It is much faster to read from a numpy array when training
    def saveto_numpy(self, path):
        """
        Saves this dataset's tracks through a NumpyMeta stored at <path>/<name>.npy.

        The NumpyMeta is kept on self.numpy_data and its `f` attribute is reset to None afterwards.

        :param path: folder the .npy file is written under
        """
        base_path = os.path.join(path, self.name)
        meta = NumpyMeta(f"{base_path}.npy")
        self.numpy_data = meta
        meta.save_tracks(self.db, self.tracks)
        meta.f = None

    def clear_tracks(self):
        del self.tracks
        del self.tracks_by_label
        del self.tracks_by_bin
        del self.tracks_by_id

    def load_db(self):
        """Creates a TrackDatabase for self.db_file and stores it on self.db."""
        database = TrackDatabase(self.db_file)
        self.db = database

    def clear_samples(self):
        self.frame_cdf = []
        self.frame_label_cdf = {}

        self.frame_samples = []
        self.frames_by_label = {}
        self.frames_by_id = {}
        self.segment_cdf = []
        self.segment_label_cdf = {}
        # segments list
        self.segments = []
        self.segments_by_label = {}
        self.segments_by_id = {}
        for track in self.tracks:
            track.segments = None
            track.sample_frames = None
        gc.collect()

    def clear_unused(self):
        if self.use_segments:
            self.frame_cdf = []
            self.frame_label_cdf = {}

            self.frame_samples = []
            self.frames_by_label = {}
            self.frames_by_id = {}
        else:
            self.segment_cdf = []
            self.segment_label_cdf = {}
            # segments list
            self.segments = []
            self.segments_by_label = {}
            self.segments_by_id = {}
        gc.collect()

    def set_read_only(self, read_only):
        if self.db is not None:
            self.db.set_read_only(read_only)

    def highest_mass_only(self):
        # top_frames for i3d generates all segments above a  min average mass
        # use this to take only the best
        remove = [
            segment for segment in self.segments if not segment.best_mass
        ]

        for segment in remove:
            segment.track.segments.remove(segment)
            self.segments_by_label[segment.label].remove(segment)
            del self.segments_by_id[segment.id]
        self.segments = [
            segment for segment in self.segments if segment.best_mass
        ]

        self.rebuild_cdf()

    @property
    def sample_count(self):
        return len(self.samples())

    def samples(self):
        if self.use_segments:
            return self.segments
        return self.frame_samples

    def set_samples(self, samples):
        if self.use_segments:
            self.segments = samples
        else:
            self.frame_samples = samples

    def set_samples_for(self, label, samples):
        if self.use_segments:
            self.segments_by_label[label] = samples
        else:
            self.frames_by_label[label] = samples

    def get_label_caps(self, labels, remapped=False):
        counts = []
        for label in labels:
            counts.append(len(self.samples_for(label, remapped=remapped)))
        index = math.floor(len(counts) * 0.40)
        counts.sort()
        birds = self.samples_for("bird", remapped=remapped)
        if len(birds) > 0:
            return len(birds)
        # return 4096

        return counts[index]
        # return int(np.percentile(counts, 25))

    def samples_for(self, label, remapped=False):
        labels = []
        if remapped and self.label_mapping:
            labels = [
                key for key, mapped in self.label_mapping.items()
                if mapped.lower() == label.lower()
            ]
            labels.sort()
        else:
            labels.append(label)
        samples = []
        for l in labels:
            if self.use_segments:
                samples.extend(self.segments_by_label.get(l, []))
            else:
                samples.extend(self.frames_by_label.get(l, []))
        return samples

    def get_counts(self, label):
        """
        Gets number of examples for given label
        :label: label to check
        :return: (segments, tracks, bins, weight)
        """
        segments = 0
        tracks = 0
        bins = 0
        weight = 0
        frames = 0
        if self.label_mapping:
            for key, value in self.label_mapping.items():
                if key == label or value == label:
                    label_tracks = self.tracks_by_label.get(key, [])
                    tracks += len(label_tracks)
                    segments += sum(
                        len(track.segments) for track in label_tracks)
                    frames += sum(
                        len(track.get_sample_frames())
                        for track in label_tracks
                        if track.sample_frames is not None)

        else:
            label_tracks = self.tracks_by_label.get(label, [])
            segments = sum(len(track.segments) for track in label_tracks)
            weight = self.get_label_weight(label)
            tracks = len(label_tracks)
            frames = sum(
                len(track.get_sample_frames()) for track in label_tracks
                if track.sample_frames is not None)
            bins = len([
                tracks for bin_name, tracks in self.tracks_by_bin.items()
                if len(tracks) > 0 and tracks[0].label == label
            ])
        return segments, frames, tracks, bins, weight

    def load_tracks(self, shuffle=False, before_date=None, after_date=None):
        """
        Loads track headers from track database with optional filter
        :return: [number of tracks added, total tracks].
        """
        counter = 0
        track_ids = self.db.get_all_track_ids(before_date=before_date,
                                              after_date=after_date)
        if shuffle:
            np.random.shuffle(track_ids)
        for clip_id, track_id in track_ids:
            if self.load_track(clip_id, track_id):
                counter += 1
            if counter % 50 == 0:
                logging.debug("Dataset loaded %s / %s", counter,
                              len(track_ids))
        return [counter, len(track_ids)]

    def add_tracks(self, tracks):
        """
        Adds list of tracks to dataset
        :param tracks: list of TrackHeader
        :param track_filter: optional filter
        """
        result = 0
        for track in tracks:
            if self.add_track_header(track):
                result += 1
        return result

    def add_track_header(self, track_header):
        if track_header.unique_id in self.tracks_by_id:
            return False

        self.tracks.append(track_header)
        self.add_track_to_mappings(track_header)
        self.segments.extend(track_header.segments)
        return True

    def load_track(self, clip_id, track_id):
        """
        Creates segments for track and adds them to the dataset
        :param clip_id: id of tracks clip
        :param track_id: track number
        :return: True if track was added, false if it was already loaded or filtered out
            (see filter_track).
        """
        # make sure we don't already have this track
        # NOTE(review): the original only looked this key up in tracks_by_bin, which is keyed by
        # bin id; tracks are registered by unique_id in tracks_by_id.  Presumably unique_id has
        # this "<clip_id>-<track_id>" form - confirm against TrackHeader.
        track_key = "{}-{}".format(clip_id, track_id)
        if track_key in self.tracks_by_id or track_key in self.tracks_by_bin:
            return False
        clip_meta = self.db.get_clip_meta(clip_id)
        track_meta = self.db.get_track_meta(clip_id, track_id)
        if self.filter_track(clip_meta, track_meta):
            return False
        track_header = TrackHeader.from_meta(clip_id, clip_meta, track_meta)
        self.tracks.append(track_header)

        # segment_spacing is in seconds, convert it to a whole number of frames
        segment_frame_spacing = int(
            round(self.segment_spacing * track_header.frames_per_second))
        segment_width = self.segment_length

        track_header.calculate_segments(
            segment_frame_spacing,
            segment_width,
            self.segment_min_avg_mass,
        )
        self.filtered_stats["segment_mass"] += track_header.filtered_stats[
            "segment_mass"]

        self.segments.extend(track_header.segments)
        self.add_track_to_mappings(track_header)

        return True

    def filter_track(self, clip_meta, track_meta):
        # some clips are banned for various reasons
        source = os.path.basename(clip_meta["filename"])
        if self.banned_clips and source in self.banned_clips:
            self.filtered_stats["banned"] += 1
            return True
        if "tag" not in track_meta:
            self.filtered_stats["tags"] += 1
            return True
        if track_meta["tag"] not in self.included_labels:
            self.filtered_stats["tags"] += 1
            return True

        # always let the false-positives through as we need them even though they would normally
        # be filtered out.
        if "bounds_history" not in track_meta or len(
                track_meta["bounds_history"]) == 0:
            self.filtered_stats["no_data"] += 1
            return True

        if track_meta["tag"] == "false-positive":
            return False

        # for some reason we get some records with a None confidence?
        if track_meta.get("confidence", 0.0) <= 0.6:
            self.filtered_stats["confidence"] += 1
            return True

        # remove tracks of trapped animals
        if ("trap" in clip_meta.get("event", "").lower()
                or "trap" in clip_meta.get("trap", "").lower()):
            self.filtered_stats["trap"] += 1
            return True

        return False

    def add_track_to_mappings(self, track_header):
        if self.label_mapping and track_header.label in self.label_mapping:
            track_header.label = self.mapped_label(track_header.label)

        self.tracks_by_id[track_header.unique_id] = track_header
        bins = self.tracks_by_bin.setdefault(track_header.bin_id, [])
        bins.append(track_header)

        if track_header.label not in self.labels:
            self.labels.append(track_header.label)
        if track_header.label not in self.tracks_by_label:
            self.tracks_by_label[track_header.label] = []
        self.tracks_by_label[track_header.label].append(track_header)
        segs = self.segments_by_label.setdefault(track_header.label, [])
        segs.extend(track_header.segments)
        for seg in segs:
            self.segments_by_id[seg.id] = seg
        frames = self.frames_by_label.setdefault(track_header.label, [])
        samples = track_header.get_sample_frames()
        for sample in samples:
            self.frames_by_id[sample.id] = sample
        self.frame_samples.extend(samples)
        frames.extend(samples)
        # camera = self.cameras_by_id.setdefault(
        #     track_header.camera_id, Camera(track_header.camera_id)
        # )
        self.camera_names.add(track_header.camera_id)
        # camera.add_track(track_header)

    def filter_segments(self, avg_mass, ignore_labels=None):
        """
        Removes any segments with an average mass less than the given avg_mass
        :param avg_mass: segments with less avarage mass per frame than this will be removed from the dataset.
        :param ignore_labels: these labels will not be filtered
        :return: number of segments removed
        """

        num_filtered = 0
        new_segments = []

        for segment in self.segments:

            pass_mass = segment.avg_mass >= avg_mass
            if (not ignore_labels
                    and segment.label in ignore_labels) or (pass_mass):
                new_segments.append(segment)
            else:
                num_filtered += 1

        self.segments = new_segments

        self._purge_track_segments()

        self.rebuild_cdf()

        return num_filtered

    def fetch_track(
        self,
        track: TrackHeader,
        original=False,
        sample_frames=False,
    ):
        """
        Fetches data for an entire track
        :param track: the track to fetch
        :return: segment data of shape [frames, channels, height, width]
        """
        frame_numbers = None
        if sample_frames:
            frame_numbers = [frame.frame_num for frame in track.sample_frames]
            frame_numbers.sort()
        frames = self.db.get_track(
            track.clip_id,
            track.track_id,
            original=original,
            frame_numbers=frame_numbers,
        )
        return frames

    def fetch_random_sample(self, sample, channel=None):
        sample_frames = sample.track.sample_frames
        np.random.shuffle(sample_frames)
        sample_frames = sample_frames[:sample.frames]
        sample_frames = [frame.frame_num for frame in sample_frames]
        sample_frames.sort()
        frames = self.db.get_track(
            sample.track.clip_id,
            sample.track.track_id,
            frame_numbers=sample_frames,
            channels=channel,
        )
        return frames

    def fetch_segment_data(self, sample, channel=None):
        """
        Fetches the thermal frames of a segment and derives each frame's `filtered` data.

        For every frame the clip background is cropped to the track bounds of that frame and
        subtracted from the frame's thermal data.

        :param sample: segment providing `track` and `frame_indices`
        :param channel: not used; the thermal channel is always requested
        :return: the frames returned by the database, with `filtered` set on each
        """
        track = sample.track
        frames = self.db.get_track(
            track.clip_id,
            track.track_id,
            frame_numbers=sample.frame_indices,
            channels=TrackChannels.thermal,
        )
        background = self.db.get_clip_background(track.clip_id)
        for frame in frames:
            bounds = tools.Rectangle.from_ltrb(
                *track.track_bounds[frame.frame_number])
            frame.filtered = frame.thermal - bounds.subimage(background)
        return frames

    def fetch_frame(self, frame_sample, channels=None):
        frame = self.db.get_frame(
            frame_sample.clip_id,
            frame_sample.track_id,
            frame_sample.frame_num,
        )

        return data

    def epoch_samples(self,
                      cap_samples=None,
                      replace=True,
                      random=True,
                      cap_at=None,
                      label_cap=None):
        if len(self.labels) == 0:
            return []
        labels = self.labels.copy()
        samples = []

        if (cap_at or cap_samples) and label_cap is None:
            if cap_at:
                label_cap = len(self.samples_for(cap_at, remapped=True))
            else:
                label_cap = self.get_label_caps(labels, remapped=True)

        cap = None
        for label in labels:
            if label_cap:
                cap = min(label_cap, len(self.samples_for(label,
                                                          remapped=True)))
            if label == "false-positive":
                if cap is None:
                    cap = int(label_cap * 0.5)
                else:
                    cap = min(cap, int(label_cap * 0.5))
            new = self.get_sample(cap=cap,
                                  replace=replace,
                                  label=label,
                                  random=random)
            if new is not None and len(new) > 0:
                samples.extend(new)
        labels = [sample.label for sample in samples]
        return samples

    def cdf(self):
        if self.use_segments:
            return self.segment_cdf
        return self.frame_cdf

    def label_cdf(self, label):
        if self.use_segments:
            return self.segment_label_cdf.get(label, [])
        return self.frame_label_cdf.get(label, [])

    def get_sample(self, cap=None, replace=True, label=None, random=True):
        """Returns a random frames from weighted list."""
        if label:
            samples = self.samples_for(label, remapped=True)
            cdf = self.label_cdf(label)
        else:
            samples = self.samples()
            cdf = self.cdf()
        if not samples:
            return None
        if cap is None:
            return samples
        if random:
            return np.random.choice(samples, cap, replace=replace, p=cdf)
        else:
            cap = min(cap, len(samples))
            return samples[:cap]

    def balance_bins(self, max_bin_weight=None):
        """
        Adjusts weights so that bins with a number number of segments aren't sampled so frequently.
        :param max_bin_weight: bins with more weight than this number will be scaled back to this weight.
        """

        for bin_name, tracks in self.tracks_by_bin.items():
            bin_weight = sum(track.weight for track in tracks)
            if bin_weight == 0:
                continue
            if max_bin_weight is None:
                scale_factor = 1 / bin_weight
                # means each bin has equal possiblity
            elif bin_weight > max_bin_weight:
                scale_factor = max_bin_weight / bin_weight
            else:
                scale_factor = 1
            for track in tracks:
                for segment in track.segments:
                    segment.weight = np.float16(segment.weight * scale_factor)
        self.rebuild_cdf()

    def remove_label(self, label_to_remove):
        """
        Removes all segments of given label from dataset. Label remains in dataset.labels however, so as to not
        change the ordinal value of the labels.
        """
        if label_to_remove not in self.labels:
            return
        tracks = self.tracks_by_label[label_to_remove]
        for track in tracks:
            self.remove_label()

        self.rebuild_cdf()

    def mapped_label(self, label):
        if self.label_mapping:
            return self.label_mapping.get(label, label)
        return label

    def rebuild_cdf(self, lbl_p=None):
        """Calculates the CDF used for fast random sampling for frames and
        segments, if balance labels is set each label has an equal chance of
        being chosen
        """
        if lbl_p is None:
            lbl_p = self.lbl_p
        self.rebuild_segment_cdf(lbl_p=lbl_p)
        self.rebuild_frame_cdf(lbl_p=lbl_p)

    def rebuild_frame_cdf(self, lbl_p=None):
        self.frame_cdf = []
        total = 0
        self.frame_label_cdf = {}

        for track in self.tracks:
            if track.sample_frames is None:
                continue
            for frame in track.sample_frames:
                frame_weight = track.frame_weight
                if lbl_p and track.label in lbl_p:
                    frame_weight *= lbl_p[track.label]
                total += frame_weight

                self.frame_cdf.append(frame_weight)

                cdf = self.frame_label_cdf.setdefault(track.label, [])
                cdf.append(track.frame_weight)

        if len(self.frame_cdf) > 0:
            self.frame_cdf = [x / total for x in self.frame_cdf]

        for key, cdf in self.frame_label_cdf.items():
            total = sum(cdf)
            self.frame_label_cdf[key] = [x / total for x in cdf]

        if self.label_mapping:
            mapped_cdf = {}
            labels = list(self.label_mapping.keys())
            labels.sort()
            for label in labels:
                if label not in self.frame_label_cdf:
                    continue
                label_cdf = self.frame_label_cdf[label]
                new_label = self.label_mapping[label]
                if lbl_p and label in lbl_p:
                    label_cdf = np.float64(label_cdf)
                    label_cdf *= lbl_p[label]
                cdf = mapped_cdf.setdefault(new_label, [])
                cdf.extend(label_cdf)

            for key, cdf in mapped_cdf.items():
                total = sum(cdf)
                mapped_cdf[key] = [x / total for x in cdf]
            self.frame_label_cdf = mapped_cdf

    def rebuild_segment_cdf(self, lbl_p=None):
        """Calculates the CDF used for fast random sampling"""
        self.segment_cdf = []
        total = 0
        self.segment_label_cdf = {}
        for segment in self.segments:
            seg_weight = segment.weight
            if lbl_p and segment.label in lbl_p:
                seg_weight *= lbl_p[segment.label]
            total += seg_weight
            self.segment_cdf.append(seg_weight)

        # guarantee it's in the order we will sample by
        for label, segments in self.segments_by_label.items():
            cdf = self.segment_label_cdf.setdefault(label, [])
            for segment in segments:
                cdf.append(segment.weight)

        if len(self.segment_cdf) > 0:
            self.segment_cdf = [x / total for x in self.segment_cdf]
        for key, cdf in self.segment_label_cdf.items():
            total = sum(cdf)
            if total > 0:
                self.segment_label_cdf[key] = [x / total for x in cdf]
            else:
                self.segment_label_cdf[key] = []
        # do this after so labels are balanced
        if self.label_mapping:
            mapped_cdf = {}
            labels = list(self.label_mapping.keys())
            labels.sort()
            for label in labels:
                if label not in self.segment_label_cdf:
                    continue
                label_cdf = self.segment_label_cdf[label]
                new_label = self.label_mapping[label]

                if lbl_p and label in lbl_p:
                    label_cdf = np.float64(label_cdf)
                    label_cdf *= lbl_p[label]
                cdf = mapped_cdf.setdefault(new_label, [])
                cdf.extend(label_cdf)

            for key, cdf in mapped_cdf.items():
                total = sum(cdf)

                mapped_cdf[key] = [x / total for x in cdf]
            self.segment_label_cdf = mapped_cdf

    def get_label_weight(self, label):
        """Returns the total weight for all segments of given label."""
        tracks = self.tracks_by_label.get(label)
        return sum(track.weight for track in tracks) if tracks else 0

    def regroup(
        self,
        groups,
        shuffle=True,
    ):
        """
        regroups the dataset so multiple animals can be under a single label
        """
        self.label_mapping = {}
        counts = []
        tracks_by_bin = {}
        samples = []
        for mapped_label, labels in groups.items():
            count = 0
            for label in labels:
                lbl_samples = self.samples_for(label)
                count += len(lbl_samples)
                samples.extend(lbl_samples)
                self.label_mapping[label] = mapped_label
                for sample in lbl_samples:
                    track = self.tracks_by_id[sample.unique_track_id]
                    tracks_by_bin[track.bin_id] = track
            counts.append(count)

        self.labels = list(groups.keys())
        self.labels.sort()
        self.tracks_by_bin = tracks_by_bin
        self.set_samples(samples)
        if self.use_segments:
            self.segments_by_id == {}
            for seg in samples:
                self.segments_by_id[seg.id] = seg

            if shuffle:
                np.random.shuffle(self.segments)
        elif shuffle:
            self.frames_by_id == {}
            for sample in samples:
                self.frames_by_id[sample.id] = sample
            np.random.shuffle(self.frame_samples)
        self.rebuild_cdf()

    def has_data(self):
        if self.use_segments:
            return len(self.segments) > 0
        else:
            return len(self.frame_samples) > 0

    def recalculate_segments(self, segment_type=SegmentType.ALL_RANDOM):
        """
        Regenerates the segments of every track and rebuilds the segment lookups and CDFs.

        The segment type selects the flags handed to each track's calculate_segments
        (use_important, random_frames, top_frames, random_sections) and whether the dataset's
        segment_min_avg_mass is applied; an unrecognised type uses the same flags as
        IMPORTANT_RANDOM.

        :param segment_type: SegmentType value choosing how segments are generated
        """
        # empty the current containers in place, then start from fresh ones
        self.segments_by_id.clear()
        self.segments_by_label.clear()
        del self.segments[:]
        del self.segments
        self.segments = []
        self.segments_by_label = {}
        self.segments_by_id = {}
        logging.info("%s generating segments  type %s", self.name,
                     segment_type)
        start = time.time()
        filtered_total = 0

        # (use_important, random_frames, top_frames, random_sections, apply min avg mass)
        if segment_type == SegmentType.IMPORTANT_RANDOM:
            flags = (True, True, False, False, True)
        elif segment_type == SegmentType.ALL_RANDOM:
            flags = (False, True, False, False, True)
        elif segment_type == SegmentType.IMPORTANT_SEQUENTIAL:
            flags = (True, False, False, False, True)
        elif segment_type == SegmentType.ALL_SEQUENTIAL:
            flags = (False, False, False, False, True)
        elif segment_type == SegmentType.TOP_SEQUENTIAL:
            flags = (True, False, True, False, True)
        elif segment_type == SegmentType.ALL_RANDOM_SECTIONS:
            flags = (False, True, False, True, True)
        elif segment_type == SegmentType.ALL_RANDOM_NOMIN:
            flags = (False, False, False, False, False)
        elif segment_type == SegmentType.TOP_RANDOM:
            flags = (False, True, True, False, True)
        else:
            flags = (True, True, False, False, True)
        use_important, random_frames, top_frames, random_sections, apply_min_mass = flags

        for track in self.tracks:
            # segment_spacing is in seconds, convert it to a whole number of frames
            frame_spacing = int(
                round(self.segment_spacing * track.frames_per_second))
            min_avg_mass = self.segment_min_avg_mass if apply_min_mass else None
            track.calculate_segments(
                frame_spacing,
                self.segment_length,
                random_frames=random_frames,
                use_important=use_important,
                top_frames=top_frames,
                segment_min_mass=min_avg_mass,
                random_sections=random_sections,
            )
            filtered_total = filtered_total + track.filtered_stats[
                "segment_mass"]
            if len(track.segments) == 0:
                # tracks without segments stay in the dataset but contribute nothing here
                continue
            for seg in track.segments:
                self.segments_by_id[seg.id] = seg

            self.segments.extend(track.segments)
            self.segments_by_label.setdefault(track.label, []).extend(track.segments)

        self.rebuild_cdf()
        logging.info(
            "%s #segments %s filtered stats are %s took  %s",
            self.name,
            len(self.segments),
            filtered_total,
            time.time() - start,
        )

    def remove_track(self, track):
        self.tracks.remove(track)
        del self.tracks_by_id[track.unique_id]
        if track.bin_id in self.tracks_by_bin:
            del self.tracks_by_bin[track.bin_id]
        self.tracks_by_label[track.label].remove(track)