Example #1
    def test_compute_worker_assignments_one_spectro_done(self):

        # Scenario: one spectro was already done:
        with tempfile.TemporaryDirectory(dir='/tmp', 
                                         prefix='test_spectro') as dst_dir:
            
            # Fake-create an existing spectrogram: 
            os.mkdir(os.path.join(dst_dir, 'HENLES_S'))
            done_spectro_path = os.path.join(dst_dir, 
                                             'HENLES_S/SONG_Henicorhinaleucosticta_xc259378.png')
            Path(done_spectro_path).touch()
            
            num_tasks_done = len(Utils.find_in_dir_tree(
                dst_dir,
                pattern='*.png', 
                entry_type='file'))

            true_num_assignments = self.num_sound_files - num_tasks_done

            self.verify_worker_assignments(self.sound_root,
                                           dst_dir, 
                                           WhenAlreadyDone.SKIP, 
                                           true_num_assignments)
            
            # If we are to overwrite existing files,
            # all sound files will need to be done:
            
            true_num_assignments = self.num_sound_files
            self.verify_worker_assignments(self.sound_root,
                                           dst_dir, 
                                           WhenAlreadyDone.OVERWRITE, 
                                           true_num_assignments)
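
The test above (and several below) counts files with Utils.find_in_dir_tree. Its implementation is not shown here; a minimal sketch of the assumed behavior (a recursive glob that can be restricted to files or directories) is:

import fnmatch
import os

def find_in_dir_tree(root, pattern='*', entry_type=None):
    # Walk root recursively and collect paths whose basename matches
    # the glob pattern. entry_type may be 'file' or 'dir' to restrict
    # the matches; None matches both. This is only an illustration of
    # the assumed contract, not the project's actual Utils helper.
    matches = []
    for dirpath, dirnames, filenames in os.walk(root):
        if entry_type == 'file':
            candidates = filenames
        elif entry_type == 'dir':
            candidates = dirnames
        else:
            candidates = filenames + dirnames
        for name in candidates:
            if fnmatch.fnmatch(name, pattern):
                matches.append(os.path.join(dirpath, name))
    return matches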
Example #2
    def setUpClass(cls):
        cls.cur_dir     = os.path.dirname(__file__)
        cls.sound_root  = os.path.join(cls.cur_dir, 'sound_data')
        # Number of cores to use:
        num_cores = mp.cpu_count()
        cls.num_workers = round(num_cores * Utils.MAX_PERC_OF_CORES_TO_USE  / 100)
        
        cls.num_sound_files = len(Utils.find_in_dir_tree(
            cls.sound_root, 
            pattern='*.mp3', 
            entry_type='file'))
        
        cls.assignments = np.array(
            [[('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259380.mp3'),
              ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259381.mp3')],
             [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259383.mp3'),
              ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259379.mp3')],
             [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259378.mp3'),
              ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259384.mp3')],
             [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc513.mp3'),
              ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc518466.mp3')],
             [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc531750.mp3'),
              ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc50519.mp3')],
             [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc511477.mp3'),
              ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc548015.mp3')]])
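
The assignments array above pairs each worker with a short list of (species_dir, file_name) tuples. The project's compute_worker_assignments is not shown; a minimal sketch of how such an even split could be produced (purely illustrative, not the project's algorithm) is:

def partition_evenly(items, num_workers):
    # Distribute items round-robin so every worker receives
    # len(items) // num_workers items, give or take one.
    buckets = [[] for _ in range(num_workers)]
    for i, item in enumerate(items):
        buckets[i % num_workers].append(item)
    return buckets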
Example #3
    def sign_of_life(cls,
                     job,
                     num_already_present_imgs,
                     outdir,
                     start_time,
                     force_rewrite=False):

        # Time for sign of life?
        now_time = datetime.datetime.now()
        time_duration = now_time - start_time
        # Report every 3 seconds, but only after at least 3 seconds have elapsed:
        if force_rewrite \
           or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):

            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(time_duration,
                                                    granularity=4)

            # Get current and new spectro imgs in outdir:
            num_now_present_imgs = len(
                Utils.find_in_dir_tree(outdir, pattern="*.png"))
            num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs

            # Keep printing number of done snippets in the same
            # terminal line:
            print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
                   f"({num_newly_present_imgs} new) after {duration_str}"),
                  end='\r')
            return num_newly_present_imgs
        else:
            return num_already_present_imgs
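
FileUtils.time_delta_str is used above to render the elapsed time; its implementation is not shown. A rough stand-in, under the assumption that granularity limits how many time units are printed:

def time_delta_str(delta, granularity=2):
    # Break a datetime.timedelta into days/hours/minutes/seconds and
    # keep the first `granularity` non-zero units. Illustrative only;
    # not the project's FileUtils implementation.
    seconds = int(delta.total_seconds())
    units = (('day', 86400), ('hour', 3600), ('minute', 60), ('second', 1))
    parts = []
    for name, size in units:
        value, seconds = divmod(seconds, size)
        if value:
            parts.append(f"{value} {name}{'s' if value != 1 else ''}")
    return ', '.join(parts[:granularity]) or '0 seconds'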
Example #4
    def cull(self, dir_root, fextension):
        
        # Good recording code ranges:
        good_rec_id_rngs = [range(50000, 70001),
                            range(170000, 180001),
                            ] 
        
        # Get all audio file paths under dir_root:
        
        pattern = f'*{fextension}' 
        wav_paths = Utils.find_in_dir_tree(dir_root, 
                                           pattern=pattern)
                                           
        
        # Example inputs and whether each would be kept:
        # wav_paths = ['/foo/bar/AM01_20190711_049999.wav', # no
        #              '/foo/bar/AM01_20190711_050000.wav', # yes
        #              '/foo/bar/AM01_20190711_070000.wav', # yes
        #              '/foo/bar/AM01_20190711_070001.wav', # no
        #              '/foo/bar/AM01_20190711_169999.wav', # no
        #              '/foo/bar/AM01_20190711_170000.wav', # yes
        #              '/foo/bar/AM01_20190711_170001.wav', # no
        #             ]

        # Collect files whose recorder serial number lies
        # outside the good ranges:
        to_delete = []
        for aud_path in wav_paths:
            ser_num = self.extract_ser_num(aud_path)
            if   ser_num in good_rec_id_rngs[0] \
              or ser_num in good_rec_id_rngs[1]:
                continue
            else:
                to_delete.append(aud_path)

        print(f"Examined {len(wav_paths)} {pattern} files...")
        if len(to_delete) > 0:
            
            if Utils.user_confirm(f"List the {len(to_delete)} bad files? (n/Y)", default='Y'):
                for fpath in to_delete:
                    print(f"{os.path.getsize(fpath)} bytes: {fpath}")

            if Utils.user_confirm(f"Delete {len(to_delete)} aud files? (N/y):", default='N'):
                num_deleted = 0
                for fname in to_delete:
                    try:
                        os.remove(fname)
                    except Exception as e:
                        print(f"Could not delete {fname}: {repr(e)}")
                    else:
                        num_deleted += 1
                        
                print(f"Removed {num_deleted} files")
            else:
                print('Canceling')
        else:
            print('No files are out of good recorder serial number ranges')
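
extract_ser_num is called above but not shown. Judging from the commented example paths, it parses the last underscore-separated field of the file name as an integer (e.g. AM01_20190711_050000.wav -> 50000). A sketch under that assumption, written here as a free function:

import os

def extract_ser_num(aud_path):
    # '/foo/bar/AM01_20190711_050000.wav' -> 50000 (assumed format).
    stem = os.path.splitext(os.path.basename(aud_path))[0]
    return int(stem.split('_')[-1])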
Example #5
    def chop_all(self):
        '''
        Workhorse: Assuming self.in_dir is root of all
        species audio samples:
        
                        self.in_dir
                        
          Species1        Species2   ...     Speciesn
           smpl1_1.mp3      smpl2_1.mp3         smpln_1.mp3
           smpl1_2.mp3      smpl2_2.mp3         smpln_2.mp3
                            ...
                            
        Chops each .mp3 (or .wav) file into window_len snippets.
        Saves those snippets in a new directory. Creates a spectrogram 
        for each snippet, and saves those in a different, new directory.
        
        Resulting directories under self.out_dir will be:
         
                         self.out_dir
            spectrograms               wav-files
            
        If self.specific_species is None, audio files under all
        species are chopped. Else, self.specific_species is 
        expected to be a list of species names that correspond
        to the names of species directories above: Species1, Species2, etc.
        
        Returns a 2-tuple: (number of created .wav audio snippet files,
                            number of created .png spectrogram snippet files)
        
        '''
        for species in self.species_list:
            audio_files = os.listdir(os.path.join(self.in_dir, species))
            num_files   = len(audio_files)
            for i, sample_name in enumerate(audio_files):
                # Chop one audio file:
                self.log.info(f"Chopping {species} audio {i}/{num_files}")
                self.chop_one_audio_file(self.in_dir, species, sample_name, self.out_dir)
            self.num_chopped += num_files

        num_spectros = len(utils.find_in_dir_tree(self.spectrogram_dir_path, pattern='*.png'))
        num_audios   = len(utils.find_in_dir_tree(self.wav_dir_path, pattern='*.wav'))
        return (num_audios, num_spectros)
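
chop_all assumes the species/sample directory layout sketched in its docstring. A small, hypothetical helper for fabricating that layout in a test (names and placement are illustrative, not part of the project):

import os
import tempfile

def make_fake_species_tree(species_to_files):
    # Create <tmp_root>/<species>/<file> for every entry, mirroring the
    # in_dir layout chop_all expects. Files are empty placeholders.
    # Example: make_fake_species_tree({'DOVE': ['dove1.mp3', 'dove2.mp3']})
    tmp_root = tempfile.mkdtemp(prefix='chop_test_')
    for species, fnames in species_to_files.items():
        os.makedirs(os.path.join(tmp_root, species), exist_ok=True)
        for fname in fnames:
            open(os.path.join(tmp_root, species, fname), 'a').close()
    return tmp_root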
Example #6
    def setUpClass(cls):
        super(TestChopSpectrograms, cls).setUpClass()
        
        cls.skip_size = 2 # sec
        
        cls.cur_dir  = os.path.dirname(__file__)
        cls.spectro_root = os.path.join(cls.cur_dir, 
                                       'spectro_data_long')
        cls.spectro_file = os.path.join(cls.spectro_root, 'DOVE/dove_long.png')
        
        cls.num_spectro_files = len(Utils.find_in_dir_tree(
            cls.spectro_root, 
            pattern='*.png', 
            entry_type='file'))

        _spectro, metadata = SoundProcessor.load_spectrogram(cls.spectro_file)
        try:
            cls.duration      = float(metadata['duration'])
        except KeyError:
            raise AssertionError(f"Spectrogram test file {os.path.basename(cls.spectro_file)} has no duration metadata")
        
        cls.default_win_len = 5 # seconds
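
SoundProcessor.load_spectrogram is expected to return the spectrogram plus a metadata dict that contains at least a 'duration' entry. If that metadata lives in PNG text chunks, reading it with Pillow could look like the sketch below (the storage format is an assumption, and this is not the project's loader):

from PIL import Image

def read_png_metadata(png_path):
    # PNG tEXt/iTXt chunks are exposed by Pillow on the opened image;
    # fall back to the generic info dict if no text chunks are present.
    with Image.open(png_path) as img:
        return dict(getattr(img, 'text', {}) or img.info)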
Example #7
    def test_from_commandline(self):
        with tempfile.TemporaryDirectory(dir='/tmp', 
                                         prefix='test_spectro') as dst_dir:
            
            args = Arguments()
            args.input   = self.spectro_root
            args.outdir  = dst_dir
            args.workers = None
            
            # Number of spectrogram .png files
            # in source tree:
            spectros_to_chop = Utils.find_in_dir_tree(self.spectro_root, '*.png')

            manager = mp.Manager()
            global_info = manager.dict()
            global_info['jobs_status'] = manager.list()

            # ------ Chop spectrograms:
            SpectrogramChopper.run_workers(
                args,
                global_info,
                overwrite_policy=WhenAlreadyDone.OVERWRITE
                )
                
            dirs_filled = [os.path.join(dst_dir, species_dir) 
                           for species_dir 
                           in os.listdir(dst_dir)]
            
            num_spectros_done = sum([len(Utils.find_in_dir_tree(one_filled_dir, '*.png'))
                                     for one_filled_dir
                                     in dirs_filled])

            self.assertTrue(num_spectros_done > len(spectros_to_chop))
            
            self.check_spectro_sanity(dirs_filled)
            
            # Remember the creation times:
            file_times = self.record_creation_times(dirs_filled)

            # ------ SKIP the existing spectrograms:
            # Run again, asking to skip already existing
            # spectros:
            global_info = manager.dict()
            global_info['jobs_status'] = manager.list()
            
            SpectrogramChopper.run_workers(
                args,
                global_info,
                overwrite_policy=WhenAlreadyDone.SKIP
                )

            dirs_filled = [os.path.join(dst_dir, species_dir) 
                           for species_dir 
                           in os.listdir(dst_dir)]

            # Mod times of png files must NOT have changed,
            # b/c of skipping
            new_file_times = self.record_creation_times(dirs_filled)
            self.assertDictEqual(new_file_times, file_times)
            
            # ------ Force RECREATION of spectrograms:
            # Run again with OVERWRITE, forcing the 
            # spectros to be done again:
            global_info = manager.dict()
            global_info['jobs_status'] = manager.list()

            SpectrogramChopper.run_workers(
                args,
                global_info,
                overwrite_policy=WhenAlreadyDone.OVERWRITE
                )
                
            dirs_filled = [os.path.join(dst_dir, species_dir) 
                           for species_dir 
                           in os.listdir(dst_dir)]
                           
            self.check_spectro_sanity(dirs_filled)
            
            # File times must be *different* from previous
            # run because we asked to overwrite:

            new_file_times = self.record_creation_times(dirs_filled)
            for fname in file_times.keys():
                try:
                    self.assertTrue(new_file_times[fname] != file_times[fname])
                except KeyError as e:
                    print(repr(e))
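
record_creation_times is not shown; the assertions only need it to map each generated .png path to a stable timestamp. A plausible minimal version, written here as a free function (an assumption based on how it is used above):

import os
from pathlib import Path

def record_creation_times(dirs_filled):
    # Map every .png under the given directories to its modification time.
    file_times = {}
    for one_dir in dirs_filled:
        for png_path in Path(one_dir).rglob('*.png'):
            file_times[str(png_path)] = os.path.getmtime(png_path)
    return file_times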
Example #8
    def run_workers(cls,
                    args,
                    global_info,
                    overwrite_policy=WhenAlreadyDone.ASK):
        '''
        Called by main to run the SpectrogramChopper in
        multiple processes at once. Partitions the
        spectrogram files to be processed; runs the chopping
        while giving visual progress on terminal.
        
        Prints success/failure of each worker, then
        returns. To keep processes from repeatedly reporting
        the same information, or reporting only locally held
        state, the globally visible dict `global_info` is passed in.
        
        This method adds these key/val pairs:
        
           1 The total number of spectrograms to chop (key 'snips_to_do')
           2 The number of already created snippets (key 'snips_done')
           3 A list with one False value per job, indicating
               that the corresponding job is not yet done (key 'jobs_status')

        Processes update 2 and 3 as they report progress.

        :param args: all arguments provided to argparse
        :type args: {str : Any}
        :param global_info: interprocess communication
            dict for reporting progress
        :type global_info: multiprocessing.manager.dict
        '''
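
        # For orientation: after the initialization below, global_info
        # will look roughly like this (values are illustrative):
        #
        #    {'jobs_status': [False, False, ...],  # one flag per worker
        #     'snips_to_do': 124,                  # .png files under args.input
        #     'snips_done' : 17}                   # .png files already in args.outdir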

        # Get a list of lists of species names
        # to process. The list is computed such
        # that each worker has roughly the same
        # number of recordings to chop. We let
        # the method determine the number of workers
        # by using 80% of the available cores.
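
        # worker_assignments comes back as one sub-list of
        # (species_dir, file_name) tuples per worker, e.g. (illustrative):
        #    [[('HENLES_S', 'spectro_1.png'), ('HENLES_S', 'spectro_2.png')],
        #     [('DYSMEN_S', 'spectro_3.png'), ('DYSMEN_S', 'spectro_4.png')]]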

        (worker_assignments,
         num_workers) = SpectrogramChopper.compute_worker_assignments(
             args.input, args.outdir, num_workers=args.workers)

        print(f"Distributing workload across {num_workers} workers.")

        # Initialize the dict with shared information:

        # Fill the inter-process list with False.
        # It is used to avoid logging a job's completion
        # to the console multiple times (i.e. it is not used
        # for anything other than progress reporting):

        for _i in range(num_workers):
            # NOTE: reportedly one cannot just set the passed-in
            #       list to [False]*num_workers, b/c
            #       a regular python list won't be
            #       seen by other processes, even if
            #       embedded in a multiprocessing.manager.list
            #       instance:
            global_info['jobs_status'].append(False)

        # Number of full spectrograms to chop:
        global_info['snips_to_do'] = len(
            Utils.find_in_dir_tree(args.input, pattern="*.png"))

        # For progress reports, get number of already
        # existing .png files in out directory:
        global_info['snips_done'] = len(
            Utils.find_in_dir_tree(args.outdir, pattern="*.png"))

        # Assign each list of species to one worker:

        chopping_jobs = []
        for ass_num, assignment in enumerate(worker_assignments):
            chopper = SpectrogramChopper(args.input,
                                         args.outdir,
                                         overwrite_policy=overwrite_policy)
            ret_value_slot = mp.Value("b", False)
            job = ProcessWithoutWarnings(
                target=chopper.chop_from_file_list,
                args=(
                    assignment,
                    args.input,
                    args.outdir,
                    global_info,
                    overwrite_policy,
                    ret_value_slot),
                name=f"ass# {ass_num}")
            job.ret_val = ret_value_slot

            chopping_jobs.append(job)
            print(f"Starting chops for {job.name}")
            job.start()

        start_time = datetime.datetime.now()

        # Keep checking on each job until all are done,
        # as indicated by every jobs_status value being
        # True (i.e. the flags summing to num_workers):

        while sum(global_info['jobs_status']) < num_workers:
            for job_idx, job in enumerate(chopping_jobs):
                # Timeout 1 sec
                job.join(1)
                if job.exitcode is not None:
                    if global_info['jobs_status'][job_idx]:
                        # One of the processes has already
                        # reported this job as done. Don't
                        # report it again:
                        continue

                    # Let other processes know that this job
                    # is done, and they don't need to report
                    # that fact: we'll do it here below:
                    global_info['jobs_status'][job_idx] = True

                    # This job finished, and that fact has not
                    # been logged yet to the console:

                    res = "OK" if job.ret_val else "Error"
                    # New line after the single-line progress msgs:
                    print("")
                    print(
                        f"Worker {job.name}/{num_workers} finished with: {res}"
                    )
                    global_info['snips_done'] = cls.sign_of_life(
                        job,
                        global_info['snips_done'],
                        args.outdir,
                        start_time,
                        force_rewrite=True)
                    # Check on next job:
                    continue

                # This job not finished yet.
                # Time for sign of life?
                global_info['snips_done'] = cls.sign_of_life(
                    job,
                    global_info['snips_done'],
                    args.outdir,
                    start_time,
                    force_rewrite=False)
Example #9
    def run_workers(cls, args, overwrite_policy=WhenAlreadyDone.ASK):
        '''
        Called by main to run the SpectrogramChopper in
        multiple processes at once. Partitions the
        audio files to be processed; runs the chopping
        while giving visual progress on terminal.
        
        Prints success/failure of each worker. Then
        returns.

        :param args: all arguments provided to argparse
        :type args: {str : Any}
        '''
        
        in_dir = args.input
    
        # Get a list of lists of species names
        # to process. The list is computed such
        # that each worker has roughly the same
        # number of recordings to chop. We let
        # the method determine the number of workers
        # by using 80% of the available cores. 
        
        (worker_assignments, num_workers) = SpectrogramChopper.compute_worker_assignments(
            in_dir,
            num_workers=args.workers)
    
        print(f"Distributing workload across {num_workers} workers.")
        # Assign each list of species to one worker:
        
        chopping_jobs = []
        for ass_num, assignment in enumerate(worker_assignments):
            chopper = SpectrogramChopper(in_dir,
                                         args.output_dir,
                                         overwrite_policy=overwrite_policy)
            ret_value_slot = mp.Value("b", False)
            job = ProcessWithoutWarnings(target=chopper.chop_from_file_list,
                                         args=(assignment, ret_value_slot),
                                         name=f"ass# {ass_num}")
            job.ret_val = ret_value_slot
            
            chopping_jobs.append(job)
            print(f"Starting chops for {job.name}")
            job.start()
        
        for job in chopping_jobs:
            job_done = False
            while not job_done:
                # Check for job done with one sec timeout: 
                job.join(1)
                # Get number of generated snippets:
                num_chopped_snippets = \
                    len(utils.find_in_dir_tree(SpectrogramChopper.spectrogram_dir_path))
                # Keep printing number of done snippets in the same
                # terminal line:
                print(f"Number of audio snippets: {num_chopped_snippets}", end='\r')
                # If the call to join() timed out
                if job.exitcode is None:
                    # Job not done:
                    continue
                res = "OK" if job.ret_val else "Error"
                # New line after the progress msgs:
                print("")
                print(f"Chops of {job.name}/{num_workers}: {res}")
                job_done = True
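
Both run_workers variants rely on the same pattern: join() with a short timeout lets the parent print progress while periodically checking exitcode for completion. A stripped-down, self-contained illustration of that pattern (the worker function and timings are made up):

import multiprocessing as mp
import time

def _pretend_work(seconds):
    time.sleep(seconds)          # stand-in for the real chopping work

if __name__ == '__main__':
    jobs = [mp.Process(target=_pretend_work, args=(s,), name=f"job-{s}s")
            for s in (2, 4)]
    for job in jobs:
        job.start()
    pending = set(jobs)
    while pending:
        for job in list(pending):
            job.join(1)                      # 1-second timeout
            print(f"Waiting on {len(pending)} job(s)", end='\r')
            if job.exitcode is not None:     # join() did not time out
                print(f"\n{job.name} finished with exit code {job.exitcode}")
                pending.discard(job)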