def test_compute_worker_assignments_one_spectro_done(self):
    '''
    Scenario: exactly one spectrogram already exists in the
    destination tree before worker assignments are computed.
    '''
    with tempfile.TemporaryDirectory(dir='/tmp', prefix='test_spectro') as dst_dir:
        # Fake-create a pre-existing spectrogram file:
        os.mkdir(os.path.join(dst_dir, 'HENLES_S'))
        done_spectro_path = os.path.join(dst_dir, 'HENLES_S/SONG_Henicorhinaleucosticta_xc259378.png')
        Path(done_spectro_path).touch()

        # How many tasks count as already finished (should be 1):
        num_tasks_done = len(Utils.find_in_dir_tree(dst_dir,
                                                    pattern='*.png',
                                                    entry_type='file'))

        # Under SKIP, only the not-yet-done sound files
        # should be assigned to workers:
        self.verify_worker_assignments(self.sound_root,
                                       dst_dir,
                                       WhenAlreadyDone.SKIP,
                                       self.num_sound_files - num_tasks_done)

        # Under OVERWRITE, existing files are redone, so every
        # sound file must be assigned:
        self.verify_worker_assignments(self.sound_root,
                                       dst_dir,
                                       WhenAlreadyDone.OVERWRITE,
                                       self.num_sound_files)
def setUpClass(cls):
    '''
    One-time fixture setup: locate the test sound tree, size the
    worker pool, count the .mp3 test files, and record the
    expected per-worker (species, recording) assignments.
    '''
    cls.cur_dir = os.path.dirname(__file__)
    cls.sound_root = os.path.join(cls.cur_dir, 'sound_data')

    # Use only a percentage of the available cores:
    available_cores = mp.cpu_count()
    cls.num_workers = round(available_cores * Utils.MAX_PERC_OF_CORES_TO_USE / 100)

    mp3_files = Utils.find_in_dir_tree(cls.sound_root,
                                       pattern='*.mp3',
                                       entry_type='file')
    cls.num_sound_files = len(mp3_files)

    # Expected distribution of (species, recording) pairs
    # across workers, two recordings per worker:
    cls.assignments = np.array([
        [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259380.mp3'),
         ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259381.mp3')],
        [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259383.mp3'),
         ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259379.mp3')],
        [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259378.mp3'),
         ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259384.mp3')],
        [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc513.mp3'),
         ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc518466.mp3')],
        [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc531750.mp3'),
         ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc50519.mp3')],
        [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc511477.mp3'),
         ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc548015.mp3')]
        ])
def sign_of_life(cls, job, num_already_present_imgs, outdir, start_time, force_rewrite=False):
    '''
    Periodically print a one-line progress message showing how many
    spectrogram .png files now exist under outdir, and how many of
    those are new since num_already_present_imgs was recorded.
    The message is printed at most roughly every 3 seconds unless
    force_rewrite is True, in which case it is printed immediately.

    :param job: the worker process being reported on; only its
        .name is used, for the printed message
    :param num_already_present_imgs: .png count at the previous report
    :param outdir: directory tree to scan for *.png files
    :param start_time: datetime when processing began; used for the
        elapsed-time portion of the message
    :param force_rewrite: if True, bypass the 3-second throttle
    :return: the number of newly created images when a message was
        printed, else num_already_present_imgs unchanged
    '''
    # Time for sign of life?
    now_time = datetime.datetime.now()
    time_duration = now_time - start_time
    # Print roughly every 3 seconds (or when forced). Note this
    # only fires when called at a moment where elapsed seconds
    # happen to be a multiple of 3:
    if force_rewrite \
       or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):
        # A human-readable duration string down to minutes:
        duration_str = FileUtils.time_delta_str(time_duration, granularity=4)
        # Get current and new spectro imgs in outdir:
        num_now_present_imgs = len(Utils.find_in_dir_tree(outdir, pattern="*.png"))
        num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs
        # Keep printing number of done snippets in the same
        # terminal line (carriage return, no newline):
        print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
               f"({num_newly_present_imgs} new) after {duration_str}"),
              end='\r')
        # NOTE(review): this returns the *delta* (count of new images),
        # not num_now_present_imgs. Callers that store the return value
        # and pass it back in as num_already_present_imgs on the next
        # call will then compute against the delta, not the total —
        # looks like it should return num_now_present_imgs; verify.
        return num_newly_present_imgs
    else:
        return num_already_present_imgs
def cull(self, dir_root, fextension):
    '''
    Scan dir_root for audio files with the given extension and
    interactively offer to list and delete those whose recorder
    serial number falls outside the known-good ranges.

    :param dir_root: root of the directory tree to scan
    :param fextension: file extension (e.g. '.wav') to match
    '''
    # Good recording serial-number ranges
    # (inclusive: 50000..70000 and 170000..180000):
    good_rec_id_rngs = (range(50000, 70001),
                        range(170000, 180001))

    # Collect all matching audio file paths under dir_root:
    pattern = f'*{fextension}'
    wav_paths = Utils.find_in_dir_tree(dir_root, pattern=pattern)

    # Candidates for deletion: serial number in none
    # of the good ranges:
    to_delete = [aud_path
                 for aud_path in wav_paths
                 if not any(self.extract_ser_num(aud_path) in rng
                            for rng in good_rec_id_rngs)]

    print(f"Examined {len(wav_paths)} {pattern} files...")
    if len(to_delete) > 0:
        # Optionally show the doomed files with their sizes:
        if Utils.user_confirm(f"List the {len(to_delete)} bad files? (n/Y)", default='Y'):
            for fpath in to_delete:
                print(f"{os.path.getsize(fpath)} bytes: {fpath}")
        # Final confirmation before any removal:
        if Utils.user_confirm(f"Delete {len(to_delete)} aud files? (N/y):", default='N'):
            num_deleted = 0
            for fname in to_delete:
                try:
                    os.remove(fname)
                except Exception as e:
                    # Best-effort: report and keep going:
                    print(f"Could not delete {fname}: {repr(e)}")
                else:
                    num_deleted += 1
            print(f"Removed {num_deleted} files")
        else:
            print('Canceling')
    else:
        print('No files are out of good recorder serial number ranges')
def chop_all(self):
    '''
    Workhorse: Assuming self.in_dir is root of all species audio samples:

        self.in_dir
            Species1    Species2   ...   Speciesn
            smpl1_1.mp3 smpl2_1.mp3      smpln_1.mp3
            smpl1_2.mp3 smpl2_2.mp3      smpln_2.mp3
                        ...

    Chops each .mp3 (or .wav) file into window_len snippets.
    Saves those snippets in a new directory. Creates a spectrogram
    for each snippet, and saves those in a different, new directory.

    Resulting directories under self.out_dir will be:

        self.out_dir
            spectrograms
            wav-files

    If self.specific_species is None, audio files under all
    species are chopped. Else, self.specific_species is expected
    to be a list of species names that correspond to the names
    of species directories above: Species1, Species2, etc.

    :return: a 2-tuple: (number of created .wav audio snippet files,
        number of created .png spectrogram snippet files)
    '''
    for species in self.species_list:
        audio_files = os.listdir(os.path.join(self.in_dir, species))
        num_files = len(audio_files)
        for i, sample_name in enumerate(audio_files):
            # Chop one audio file:
            self.log.info(f"Chopping {species} audio {i}/{num_files}")
            self.chop_one_audio_file(self.in_dir,
                                     species,
                                     sample_name,
                                     self.out_dir)
        self.num_chopped += num_files

    # BUGFIX: find_in_dir_tree() returns a list of paths, but this
    # method's contract (and the docstring) promises file *counts*;
    # wrap both results in len() as done elsewhere in this file:
    num_spectros = len(utils.find_in_dir_tree(self.spectrogram_dir_path,
                                              pattern='*.png'))
    num_audios = len(utils.find_in_dir_tree(self.wav_dir_path,
                                            pattern='*.wav'))
    return (num_audios, num_spectros)
def setUpClass(cls):
    '''
    One-time fixture setup: locate the long test spectrogram,
    count the .png test files, and read the spectrogram's
    duration from its embedded metadata.
    '''
    super(TestChopSpectrograms, cls).setUpClass()

    cls.skip_size = 2        # sec
    cls.default_win_len = 5  # seconds

    cls.cur_dir = os.path.dirname(__file__)
    cls.spectro_root = os.path.join(cls.cur_dir, 'spectro_data_long')
    cls.spectro_file = os.path.join(cls.spectro_root, 'DOVE/dove_long.png')

    all_spectros = Utils.find_in_dir_tree(cls.spectro_root,
                                          pattern='*.png',
                                          entry_type='file')
    cls.num_spectro_files = len(all_spectros)

    # The duration must be present in the .png metadata,
    # else the fixture itself is broken:
    _spectro, metadata = SoundProcessor.load_spectrogram(cls.spectro_file)
    try:
        cls.duration = float(metadata['duration'])
    except KeyError:
        raise AssertionError(f"Spectrogram test file {os.path.basename(cls.spectro_file)} has no duration metadata")
def test_from_commandline(self):
    '''
    End-to-end test of SpectrogramChopper.run_workers() in three
    phases against a temporary output directory:
      1. OVERWRITE: chop everything; verify snippet counts and sanity.
      2. SKIP: run again; verify no file was touched (times unchanged).
      3. OVERWRITE: run again; verify files were recreated (times changed).
    '''
    with tempfile.TemporaryDirectory(dir='/tmp', prefix='test_spectro') as dst_dir:
        # Simulate the argparse namespace main() would build:
        args = Arguments()
        args.input = self.spectro_root
        args.outdir = dst_dir
        args.workers = None
        # Number of spectrogram .png files
        # in source tree:
        spectros_to_chop = Utils.find_in_dir_tree(self.spectro_root, '*.png')

        # Interprocess dict the workers use to report progress:
        manager = mp.Manager()
        global_info = manager.dict()
        global_info['jobs_status'] = manager.list()

        # ------ Chop spectrograms:
        SpectrogramChopper.run_workers(
            args,
            global_info,
            overwrite_policy=WhenAlreadyDone.OVERWRITE
            )

        # One output subdirectory per species:
        dirs_filled = [os.path.join(dst_dir, species_dir)
                       for species_dir in os.listdir(dst_dir)]
        num_spectros_done = sum([len(Utils.find_in_dir_tree(one_filled_dir, '*.png'))
                                 for one_filled_dir in dirs_filled])
        # Chopping produces multiple snippets per source
        # spectrogram, so the output count must exceed the input:
        self.assertTrue(num_spectros_done > len(spectros_to_chop))
        self.check_spectro_sanity(dirs_filled)

        # Remember the creation times:
        file_times = self.record_creation_times(dirs_filled)

        # ------ SKIP the existing spectrograms:
        # Run again, asking to skip already existing
        # spectros (fresh shared dict for the new run):
        global_info = manager.dict()
        global_info['jobs_status'] = manager.list()
        SpectrogramChopper.run_workers(
            args,
            global_info,
            overwrite_policy=WhenAlreadyDone.SKIP
            )
        dirs_filled = [os.path.join(dst_dir, species_dir)
                       for species_dir in os.listdir(dst_dir)]
        # Mod times of png files must NOT have changed,
        # b/c of skipping:
        new_file_times = self.record_creation_times(dirs_filled)
        self.assertDictEqual(new_file_times, file_times)

        # ------ Force RECREATION of spectrograms:
        # Run again with OVERWRITE, forcing the
        # spectros to be done again:
        global_info = manager.dict()
        global_info['jobs_status'] = manager.list()
        SpectrogramChopper.run_workers(
            args,
            global_info,
            overwrite_policy=WhenAlreadyDone.OVERWRITE
            )
        dirs_filled = [os.path.join(dst_dir, species_dir)
                       for species_dir in os.listdir(dst_dir)]
        self.check_spectro_sanity(dirs_filled)
        # File times must be *different* from previous
        # run because we asked to overwrite:
        new_file_times = self.record_creation_times(dirs_filled)
        for fname in file_times.keys():
            try:
                self.assertTrue(new_file_times[fname] != file_times[fname])
            except KeyError as e:
                # Best-effort: a file present in the first run but
                # missing now is only reported, not failed on:
                print(repr(e))
def run_workers(cls, args, global_info, overwrite_policy=WhenAlreadyDone.ASK):
    '''
    Called by main to run the SpectrogramChopper in
    multiple processes at once. Partitions the
    audio files to be processed; runs the chopping
    while giving visual progress on terminal.

    Prints success/failure of each worker. Then
    returns. In order to avoid processes repeatedly
    reporting the same, or only locally kept info,
    the globally visible dict `global_info` is passed in.

    This method will add these key/val pairs:

    1 The total number of spectros to chop (key 'num_tasks')
    2 The number of already created snippets (key 'num_snips')
    3 A list with values False for each job, indicating
      that the corresponding job is not yet done (key 'jobs_status')

    Processes will update 2 and 3 as they report progress:

    :param args: all arguments provided to argparse
    :type args: {str : Any}
    :param global_info: interprocess communication dict for
        reporting progress
    :type global_info: multiprocessing.manager.dict
    '''
    # Get a list of lists of species names
    # to process. The list is computed such
    # that each worker has roughly the same
    # number of recordings to chop. We let
    # the method determine the number of workers
    # by using 80% of the available cores.
    (worker_assignments, num_workers) = SpectrogramChopper.compute_worker_assignments(
        args.input,
        args.outdir,
        num_workers=args.workers)

    print(f"Distributing workload across {num_workers} workers.")

    # Initialize the dict with shared information:
    # Fill the inter-process list with False.
    # Will be used to avoid logging jobs finishing
    # many times to the console (i.e. not used
    # for functions other than reporting progress):
    for _i in range(num_workers):
        # NOTE: reportedly one cannot just set the passed-in
        #       list to [False]*num_workers, b/c
        #       a regular python list won't be
        #       seen by other processes, even if
        #       embedded in a multiprocessing.manager.list
        #       instance:
        global_info['jobs_status'].append(False)

    # Number of full spectrograms to chop:
    global_info['snips_to_do'] = len(
        Utils.find_in_dir_tree(args.input, pattern="*.png"))

    # For progress reports, get number of already
    # existing .png files in out directory:
    global_info['snips_done'] = len(
        Utils.find_in_dir_tree(args.outdir, pattern="*.png"))

    # Assign each list of species to one worker:
    chopping_jobs = []
    for ass_num, assignment in enumerate(worker_assignments):
        # Each worker gets its own chopper instance:
        chopper = SpectrogramChopper(args.input,
                                     args.outdir,
                                     overwrite_policy=overwrite_policy)
        # Shared byte slot through which the worker
        # reports success (True) or failure (False):
        ret_value_slot = mp.Value("b", False)
        job = ProcessWithoutWarnings(
            target=chopper.chop_from_file_list,
            args=(assignment,
                  args.input,
                  args.outdir,
                  global_info,  # ***NEW
                  overwrite_policy,
                  ret_value_slot),
            name=f"ass# {ass_num}")
        job.ret_val = ret_value_slot
        chopping_jobs.append(job)
        print(f"Starting chops for {job.name}")
        job.start()

    start_time = datetime.datetime.now()
    # Keep checking on each job, until
    # all are done as indicated by all jobs_done
    # values being True, a.k.a valued 1:
    while sum(global_info['jobs_status']) < num_workers:
        for job_idx, job in enumerate(chopping_jobs):
            # Timeout 1 sec
            job.join(1)
            if job.exitcode is not None:
                if global_info['jobs_status'][job_idx]:
                    # One of the processes has already
                    # reported this job as done. Don't
                    # report it again:
                    continue
                # Let other processes know that this job
                # is done, and they don't need to report
                # that fact: we'll do it here below:
                global_info['jobs_status'][job_idx] = True
                # This job finished, and that fact has not
                # been logged yet to the console:
                # NOTE(review): job.ret_val is an mp.Value instance,
                # which is always truthy; this likely should test
                # job.ret_val.value — verify.
                res = "OK" if job.ret_val else "Error"
                # New line after the single-line progress msgs:
                print("")
                print(f"Worker {job.name}/{num_workers} finished with: {res}")
                global_info['snips_done'] = cls.sign_of_life(
                    job,
                    global_info['snips_done'],
                    args.outdir,
                    start_time,
                    force_rewrite=True)
                # Check on next job:
                continue
            # This job not finished yet.
            # Time for sign of life?
            # NOTE(review): force_rewrite=True here bypasses
            # sign_of_life()'s 3-second throttle on every poll
            # iteration — confirm that is intended:
            global_info['snips_done'] = cls.sign_of_life(
                job,
                global_info['snips_done'],
                args.outdir,
                start_time,
                force_rewrite=True)
def run_workers(cls, args, overwrite_policy=WhenAlreadyDone.ASK):
    '''
    Called by main to run the SpectrogramChopper in
    multiple processes at once. Partitions the
    audio files to be processed; runs the chopping
    while giving visual progress on terminal.

    Prints success/failure of each worker. Then returns.

    :param args: all arguments provided to argparse
    :type args: {str : Any}
    :param overwrite_policy: what to do when an output
        file already exists
    '''
    in_dir = args.input

    # Get a list of lists of species names
    # to process. The list is computed such
    # that each worker has roughly the same
    # number of recordings to chop. We let
    # the method determine the number of workers
    # by using 80% of the available cores.
    (worker_assignments, num_workers) = SpectrogramChopper.compute_worker_assignments(
        in_dir,
        num_workers=args.workers)

    print(f"Distributing workload across {num_workers} workers.")

    # Assign each list of species to one worker:
    chopping_jobs = []
    for ass_num, assignment in enumerate(worker_assignments):
        chopper = SpectrogramChopper(in_dir,
                                     args.output_dir,
                                     overwrite_policy=overwrite_policy
                                     )
        # Shared byte slot through which the worker
        # reports success (True) or failure (False):
        ret_value_slot = mp.Value("b", False)
        job = ProcessWithoutWarnings(target=chopper.chop_from_file_list,
                                     args=([assignment, ret_value_slot]),
                                     name=f"ass# {ass_num}"
                                     )
        job.ret_val = ret_value_slot
        chopping_jobs.append(job)
        print(f"Starting chops for {job.name}")
        job.start()

    # Poll each job in turn until it exits, printing
    # a rolling snippet count while waiting:
    for job in chopping_jobs:
        job_done = False
        while not job_done:
            # Check for job done with one sec timeout:
            job.join(1)
            # Get number of generated snippets:
            # NOTE(review): unlike the other call sites, no pattern=
            # argument is passed to find_in_dir_tree here — confirm
            # the default matches '*.png'/'*.wav' as intended:
            num_chopped_snippets = \
                len(utils.find_in_dir_tree(SpectrogramChopper.spectrogram_dir_path))
            # Keep printing number of done snippets in the same
            # terminal line:
            print(f"Number of audio snippets: {num_chopped_snippets}", end='\r')
            # If the call to join() timed out
            if job.exitcode is None:
                # Job not done:
                continue
            # NOTE(review): job.ret_val is an mp.Value instance,
            # which is always truthy; this likely should test
            # job.ret_val.value — verify.
            res = "OK" if job.ret_val else "Error"
            # New line after the progress msgs:
            print("")
            print(f"Chops of {job.name}/{num_workers}: {res}")
            job_done = True