Пример #1
0
    def test_compute_num_augs_per_species(self):
        '''
        Check Utils.compute_num_augs_per_species() for the
        MEDIAN augmentation goal: the expected result tops each
        species up to the median sample count, and asks for no
        augmentations for species at or above the median.
        '''

        aug_goals = AugmentationGoals.MEDIAN

        # Sample counts per species:
        #           num_samples
        #      foo           10
        #      bar           25
        #      fum           50
        population = pd.DataFrame.from_dict(
            {'foo': 10, 'bar': 25, 'fum': 50},
            orient='index',
            columns=['num_samples'])

        # Guard the fixture itself: the expectations
        # below are derived from a median of 25:
        med = population.loc[:, 'num_samples'].median()
        self.assertEqual(med, 25)

        # species foo must receive med-10  = 15           augmentations
        #         bar              med-25  =  0           augmentations
        #         fum              med-50  = -25 --> 0    augmentations
        truth = {'foo': 15, 'bar': 0, 'fum': 0}

        res = Utils.compute_num_augs_per_species(aug_goals, population)
        self.assertDictEqual(truth, res)
Пример #2
0
    def create_dest_dirs(self, species_list):
        '''
        Prepare the directory tree that receives the new
        spectrogram snippets: self.out_dir as the root, plus
        one subdirectory per species.

        Directory creation is delegated to Utils.create_folder(),
        which receives self.overwrite_policy to decide what
        happens when a directory already exists. A falsy return
        from create_folder() for a species directory aborts the
        operation. The return value for the root directory is
        not checked.

        :param species_list: names of species to process
        :type species_list: [str]
        :return: top level dir for spectrograms (same as self.out_dir)
        :rtype: str
        :raise FileExistsError: if Utils.create_folder() reports
            failure for one of the species directories
        '''

        # Root dir above all species' snippet directories:
        Utils.create_folder(self.out_dir,
                            overwrite_policy=self.overwrite_policy)

        # One subdirectory per species:
        for species_name in species_list:
            target_dir = os.path.join(self.out_dir, species_name)
            created = Utils.create_folder(
                target_dir,
                overwrite_policy=self.overwrite_policy)
            if created:
                continue
            raise FileExistsError(
                f"Target dir {target_dir} exists; aborting")

        return self.out_dir
Пример #3
0
    def cull(self, dir_root, fextension):
        '''
        Interactively remove audio files whose recorder serial
        number lies outside the known-good ranges.

        All files below dir_root that match '*<fextension>' are
        examined. The serial number of each file is obtained from
        self.extract_ser_num(). Files whose number is in neither
        good range become deletion candidates. The user is then
        asked whether to list the candidates, and whether to
        delete them. Deletion failures are reported, and do not
        stop the remaining deletions.

        :param dir_root: root of directory tree to search
        :type dir_root: str
        :param fextension: file extension to match; appended
            directly to '*' to form the search pattern
        :type fextension: str
        '''

        # Good recording code ranges: 50000..70000,
        # and 170000..180000 (both bounds inclusive):
        good_rec_id_rngs = (range(50000, 70001),
                            range(170000, 180001))

        # All matching audio files below dir_root:
        pattern = f'*{fextension}'
        wav_paths = Utils.find_in_dir_tree(dir_root, pattern=pattern)

        # Keep only the paths whose serial number is in
        # none of the good ranges:
        to_delete = []
        for aud_path in wav_paths:
            ser_num = self.extract_ser_num(aud_path)
            if not any(ser_num in rng for rng in good_rec_id_rngs):
                to_delete.append(aud_path)

        print(f"Examined {len(wav_paths)} {pattern} files...")
        if not to_delete:
            print('No files are out of good recorder serial number ranges')
            return

        if Utils.user_confirm(f"List the {len(to_delete)} bad files? (n/Y)", default='Y'):
            for fpath in to_delete:
                print(f"{os.path.getsize(fpath)} bytes: {fpath}")

        if not Utils.user_confirm(f"Delete {len(to_delete)} aud files? (N/y):", default='N'):
            print('Canceling')
            return

        num_deleted = 0
        for fname in to_delete:
            try:
                os.remove(fname)
            except Exception as e:
                # Report, and move on to the next file:
                print(f"Could not delete {fname}: {repr(e)}")
                continue
            num_deleted += 1

        print(f"Removed {num_deleted} files")
0
    def test_compute_num_augs_per_species(self):
        '''
        With the MAX augmentation goal and the sample counts
        found under self.spectros_dir, AMADEC is expected to
        need 4 augmentations, and FORANA none.
        '''

        aug_volumes = AugmentationGoals.MAX
        species_counts = Utils.sample_compositions_by_species(self.spectros_dir)
        augs_to_do = Utils.compute_num_augs_per_species(aug_volumes,
                                                        species_counts)

        for species, num_expected in (('AMADEC', 4), ('FORANA', 0)):
            self.assertEqual(augs_to_do[species], num_expected)
Пример #5
0
    def test_sample_compositions_by_species(self):
        '''
        Utils.sample_compositions_by_species() must return a
        dataframe with the single column 'num_samples', indexed
        by species. For the test data the expected content is:

                    num_samples
            AMADEC            1
            FORANA            5
        '''

        dist_df = Utils.sample_compositions_by_species(self.spectros_dir)
        self.assertListEqual(list(dist_df.columns), ['num_samples'])

        # Address each cell by (row, column) label. Selecting
        # only the row yields a one-element Series, and calling
        # int() on such a Series is deprecated in pandas:
        self.assertEqual(int(dist_df.loc['AMADEC', 'num_samples']), 1)
        self.assertEqual(int(dist_df.loc['FORANA', 'num_samples']), 5)
Пример #6
0
    def prep_aug_tmp_dirs(self, dst_tmp_dir):
        '''
        Copies every entry found under self.full_species_root
        (the species directories of the test data) into the
        given tmp dir, keeping each directory's name. Then
        creates dir 'aug_spectros' in that same tmp dir, and
        returns the path to that aug_spectros dir.

        :param dst_tmp_dir: temporary directory
        :type dst_tmp_dir: str
        :return: output directory for future spectro augmentations
        :rtype: str
        '''

        # Do all testing in the tmp dir, where
        # all files/dirs will be deleted automatically:

        for species_dir in Utils.listdir_abs(self.full_species_root):
            # Use the full last path component: .stem would
            # cut a directory name at its last dot:
            species_name = Path(species_dir).name
            dst_species_dir = os.path.join(dst_tmp_dir, species_name)
            shutil.copytree(species_dir, dst_species_dir)

        # Dir where augmentations are to be placed,
        # one subdir per species:
        out_dir = os.path.join(dst_tmp_dir, 'aug_spectros')
        os.mkdir(out_dir)
        return out_dir
Пример #7
0
    def test_chop_one_spectrogram_file(self):
        '''
        Chop the long test spectrogram into snippets inside a
        temporary directory. Verify the number of snippet files
        created, and the duration and species metadata embedded
        in one of them.
        '''

        with tempfile.TemporaryDirectory(dir='/tmp', prefix='chopping') as dir_nm:
            chopper = SpectrogramChopper(self.spectro_root,
                                         dir_nm,
                                         overwrite_policy=WhenAlreadyDone.OVERWRITE)

            # Snippets go into a subdirectory named
            # after the spectrogram's parent directory:
            species = Path(self.spectro_file).parent.stem
            outdir = os.path.join(dir_nm, species)

            true_snippet_time_width = chopper.chop_one_spectro_file(
                self.spectro_file, outdir, 'DOVE', skip_size=self.skip_size)
            snippet_names = os.listdir(outdir)

            # Slide a window of the reported width across the
            # spectrogram in skip_size steps; count the positions
            # at which the window's end is still short of the
            # full duration:
            num_expected_snippets = 0
            window_end = true_snippet_time_width
            while window_end < self.duration:
                num_expected_snippets += 1
                window_end += self.skip_size

            self.assertEqual(len(snippet_names), num_expected_snippets)

            # Check embedded metadata of one snippet:
            first_snippet = Utils.listdir_abs(outdir)[0]
            _spectro, metadata = SoundProcessor.load_spectrogram(first_snippet)
            recorded_duration = round(float(metadata['duration(secs)']), 3)
            self.assertEqual(recorded_duration,
                             round(true_snippet_time_width, 3))
            self.assertEqual(metadata['species'], 'DOVE')
Пример #8
0
    def sign_of_life(cls,
                     job,
                     num_already_present_imgs,
                     outdir,
                     start_time,
                     force_rewrite=False):
        '''
        Print a one-line progress report for the given job, if
        one is due. The report overwrites the current terminal
        line, because it is terminated with a carriage return.

        A report is due if force_rewrite is set, or if the
        seconds component of the time elapsed since start_time
        is greater than zero and divisible by 3.

        NOTE(review): when a report is printed, the return value
        is the number of *new* images; otherwise it is the
        num_already_present_imgs that was passed in. Confirm that
        callers expect this difference.

        :param job: job being reported on; only its name
            attribute is used
        :param num_already_present_imgs: number of .png files
            regarded as the baseline
        :type num_already_present_imgs: int
        :param outdir: root of directory tree in which to
            count .png files
        :type outdir: str
        :param start_time: time from which elapsed time is measured
        :type start_time: datetime.datetime
        :param force_rewrite: if True, report regardless of
            elapsed time
        :type force_rewrite: bool
        :return: number of .png files beyond the baseline if a
            report was printed, else num_already_present_imgs
        :rtype: int
        '''

        elapsed = datetime.datetime.now() - start_time

        # Time for sign of life?
        report_due = elapsed.seconds > 0 and elapsed.seconds % 3 == 0
        if not (force_rewrite or report_due):
            return num_already_present_imgs

        # A human readable form of the elapsed time:
        duration_str = FileUtils.time_delta_str(elapsed, granularity=4)

        # Current and new spectro imgs in outdir:
        png_paths = Utils.find_in_dir_tree(outdir, pattern="*.png")
        num_now_present_imgs = len(png_paths)
        num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs

        # Keep printing the counts in the same terminal line:
        status = (f"{job.name}---Number of spectros: {num_now_present_imgs} "
                  f"({num_newly_present_imgs} new) after {duration_str}")
        print(status, end='\r')
        return num_newly_present_imgs
Пример #9
0
    def test_compute_worker_assignments_one_spectro_done(self):
        '''
        Scenario: the spectrogram for one of the sound files
        already exists in the destination. With policy SKIP the
        number of assignments must shrink by the number of
        existing spectrograms. With policy OVERWRITE every sound
        file must be assigned.
        '''

        with tempfile.TemporaryDirectory(dir='/tmp',
                                         prefix='test_spectro') as dst_dir:

            # Fake-create an existing spectrogram:
            os.mkdir(os.path.join(dst_dir, 'HENLES_S'))
            done_spectro_path = os.path.join(
                dst_dir,
                'HENLES_S/SONG_Henicorhinaleucosticta_xc259378.png')
            Path(done_spectro_path).touch()

            done_spectros = Utils.find_in_dir_tree(dst_dir,
                                                   pattern='*.png',
                                                   entry_type='file')
            num_tasks_done = len(done_spectros)

            # Expected number of assignments for each policy:
            expectations = (
                # Existing spectros are left alone:
                (WhenAlreadyDone.SKIP, self.num_sound_files - num_tasks_done),
                # Existing spectros are redone, so all
                # sound files need to be processed:
                (WhenAlreadyDone.OVERWRITE, self.num_sound_files),
            )
            for policy, true_num_assignments in expectations:
                self.verify_worker_assignments(self.sound_root,
                                               dst_dir,
                                               policy,
                                               true_num_assignments)
Пример #10
0
    def setUpClass(cls):
        '''
        Class level fixtures: location of the test sound files,
        number of worker processes to use, number of .mp3 files
        under the sound root, and an array of
        (species, file name) assignments.
        '''
        cls.cur_dir = os.path.dirname(__file__)
        cls.sound_root = os.path.join(cls.cur_dir, 'sound_data')

        # Number of workers: a percentage of the available cores:
        cls.num_workers = round(
            mp.cpu_count() * Utils.MAX_PERC_OF_CORES_TO_USE / 100)

        sound_files = Utils.find_in_dir_tree(cls.sound_root,
                                             pattern='*.mp3',
                                             entry_type='file')
        cls.num_sound_files = len(sound_files)

        # One entry per row of the assignments array:
        # the species, and the two files of that row:
        file_pairs = [
            ('HENLES_S', ('SONG_Henicorhinaleucosticta_xc259380.mp3',
                          'SONG_Henicorhinaleucosticta_xc259381.mp3')),
            ('HENLES_S', ('SONG_Henicorhinaleucosticta_xc259383.mp3',
                          'SONG_Henicorhinaleucosticta_xc259379.mp3')),
            ('HENLES_S', ('SONG_Henicorhinaleucosticta_xc259378.mp3',
                          'SONG_Henicorhinaleucosticta_xc259384.mp3')),
            ('DYSMEN_S', ('SONG_Dysithamnusmentalis_xc513.mp3',
                          'SONG_Dysithamnusmentalis_xc518466.mp3')),
            ('DYSMEN_S', ('SONG_Dysithamnusmentalis_xc531750.mp3',
                          'SONG_Dysithamnusmentalis_xc50519.mp3')),
            ('DYSMEN_S', ('SONG_Dysithamnusmentalis_xc511477.mp3',
                          'SONG_Dysithamnusmentalis_xc548015.mp3')),
        ]
        cls.assignments = np.array(
            [[(species, fname) for fname in fnames]
             for species, fnames in file_pairs])
Пример #11
0
    def test_binary_in_interval_search(self):
        '''
        Utils.binary_in_interval_search() is expected to return
        the index of the interval that contains the probe value,
        or -1 if no interval does. The expectations treat an
        interval's low value as inside the interval, and its
        high value as outside.

        Uses unittest assertions rather than bare assert
        statements, which are stripped when Python runs with -O.
        '''

        intervals = [Interval(1, 3), Interval(4, 5), Interval(6, 7)]

        # (probe value, expected index into intervals):
        cases = [
            (0, -1),    # below all intervals
            (1, 0),     # low edge of first interval
            (2, 0),     # inside first interval
            (3, -1),    # high edge of first interval
            (4, 1),     # low edge of second interval
            (5, -1),    # high edge of second interval
            (8, -1),    # above all intervals
        ]

        for probe, expected_idx in cases:
            with self.subTest(probe=probe):
                res = Utils.binary_in_interval_search(intervals,
                                                      probe,
                                                      'low_val',
                                                      'high_val')
                self.assertEqual(res, expected_idx)
Пример #12
0
    def chop_all(self):
        '''
        Workhorse: Assuming self.in_dir is root of all
        species audio samples:
        
                        self.in_dir
                        
          Species1        Species2   ...     Speciesn
           smpl1_1.mp3      smpl2_1.mp3         smpln_1.mp3
           smpl1_2.mp3      smpl2_2.mp3         smpln_2.mp3
                            ...
                            
        Passes every file in each species directory to
        self.chop_one_audio_file(), which is expected to create
        the audio snippets and their spectrograms.
        
        Resulting directories under self.out_dir will be:
         
                         self.out_dir
            spectrograms               wav-files
            
        The species that are processed are the ones listed in
        self.species_list. The names must correspond to the 
        names of species directories above: Species1, Species2, etc.
        
        Adds the number of processed audio files to 
        self.num_chopped.
        
        :return: 2-tuple: (number of .wav audio snippet files
            under self.wav_dir_path, number of .png spectrogram
            snippet files under self.spectrogram_dir_path)
        :rtype: (int, int)
        '''
        for species in self.species_list:
            audio_files = os.listdir(os.path.join(self.in_dir, species))
            num_files   = len(audio_files)
            # Count from 1 so that progress reads 1/n ... n/n:
            for i, sample_name in enumerate(audio_files, start=1):
                # Chop one audio file:
                self.log.info(f"Chopping {species} audio {i}/{num_files}")
                self.chop_one_audio_file(self.in_dir, species, sample_name, self.out_dir)
            self.num_chopped += num_files

        # Return the counts, not the lists of paths:
        num_spectros = len(utils.find_in_dir_tree(self.spectrogram_dir_path, pattern='*.png'))
        num_audios   = len(utils.find_in_dir_tree(self.wav_dir_path, pattern='*.wav'))
        return (num_audios, num_spectros)
Пример #13
0
 def setUpClass(cls):
     '''
     Class level fixtures: directory of this test file, path
     to a saved model, directory of spectrogram snippets, and
     the path of the first entry in that snippets directory.
     '''
     # Locations relative to this file's directory:
     model_rel_path = '../../birdsong/tests/models/mod_2021-05-04T13_02_14_net_resnet18_pre_True_frz_0_lr_0.01_opt_SGD_bs_128_ks_7_folds_10_gray_True_classes_34_ep9.pth'
     snips_rel_path = '../../birdsong/utils/tests/data/fld_snippets'

     cls.cur_dir = os.path.dirname(__file__)
     cls.model_path = os.path.join(cls.cur_dir, model_rel_path)
     cls.snips_dir = os.path.join(cls.cur_dir, snips_rel_path)

     snippet_paths = Utils.listdir_abs(cls.snips_dir)
     cls.example_img_path = snippet_paths[0]
Пример #14
0
    def test_listdir_abs(self):
        '''
        Utils.listdir_abs() must return as many entries as
        os.listdir() does for the same directory, and the
        entries must be usable as paths to existing files
        or directories.
        '''

        # Get the built-in directory listing
        # with just the file names:
        nearly_truth = os.listdir(self.cur_dir)

        abs_paths = Utils.listdir_abs(self.cur_dir)
        # assertEqual: the assertEquals alias is deprecated,
        # and no longer exists as of Python 3.12:
        self.assertEqual(len(nearly_truth), len(abs_paths))

        # Check existence of first file or dir:
        self.assertTrue(os.path.exists(abs_paths[0]))
Пример #15
0
    def test_find_in_tree_gen(self):
        '''
        Utils.find_in_tree_gen() must deliver exactly the six
        .png files of the test data under self.spectros_dir.
        The order of delivery is not checked.
        '''

        found_paths = set(
            Utils.find_in_tree_gen(self.spectros_dir, pattern='*.png'))

        expected_paths = {
            f"{self.spectros_dir}/AMADEC/Amaziliadecora1061880.png",
            f"{self.spectros_dir}/FORANA/SONG_XC609364-41759.png",
            f"{self.spectros_dir}/FORANA/SONG_XC253440-FORANA04.png",
            f"{self.spectros_dir}/FORANA/SONG_XC520628-passarochao.png",
            f"{self.spectros_dir}/FORANA/SONG_XC360575-BFAN.png",
            f"{self.spectros_dir}/FORANA/SONG_XC171241-Formicarius_analis.png",
        }
        self.assertSetEqual(found_paths, expected_paths)
Пример #16
0
    def test_orig_file_name(self):
        '''
        Utils.orig_file_name() is expected to strip the
        augmentation suffix from a file name, keeping any
        directory part and the extension intact. Names without
        such a suffix are expected back unchanged.
        '''

        # (augmented file name, expected original name):
        cases = [
            # Identity:
            ("foo.wav",
             "foo.wav"),
            ("Amaziliadecora1061880-volume-10.wav",
             'Amaziliadecora1061880.wav'),
            ('Amaziliadecora1061883-rain_bgd0ms.wav',
             'Amaziliadecora1061883.wav'),
            ('Amaziliadecora1061886-shift4600ms.wav',
             'Amaziliadecora1061886.wav'),
            # With directory relative:
            ('foo/bar/Amaziliadecora1061886-shift4600ms.wav',
             'foo/bar/Amaziliadecora1061886.wav'),
            # With directory absolute:
            ('/foo/bar/Amaziliadecora1061886-shift4600ms.wav',
             '/foo/bar/Amaziliadecora1061886.wav'),
        ]

        for aug_nm, expected in cases:
            with self.subTest(aug_nm=aug_nm):
                # assertEqual: the assertEquals alias is deprecated,
                # and no longer exists as of Python 3.12:
                self.assertEqual(Utils.orig_file_name(aug_nm), expected)
Пример #17
0
 def check_spectro_sanity(self, dirs_filled):
     '''
     Fails the current test if any entry in one of the
     given directories is not larger than 5000 bytes.
     
     :param dirs_filled: list of directories whose content
         files to check for size
     :type dirs_filled: [str]
     '''
     # Smallest size (exclusive) regarded as reasonable:
     min_bytes = 5000
     for species_dst_dir in dirs_filled:
         for spec_file in Utils.listdir_abs(species_dst_dir):
             num_bytes = os.path.getsize(spec_file)
             self.assertTrue(num_bytes > min_bytes)
Пример #18
0
    def record_creation_times(self, dirs_filled):
        '''
        Given a list of directories, return a dict that maps
        each entry of those directories, as delivered by
        Utils.listdir_abs(), to the entry's modification time
        as reported by os.path.getmtime().
        
        :param dirs_filled: list of directories whose entries
            are to be time stamped
        :type dirs_filled: [str]
        :return dict of modification times
        :rtype {str : float}
        '''
        
        return {spec_fname: os.path.getmtime(spec_fname)
                for species_dst_dir in dirs_filled
                for spec_fname in Utils.listdir_abs(species_dst_dir)}
Пример #19
0
    def test_generate_all_augmentations_max(self):
        '''
        Run the MAX-goal spectrogram augmenter over a copy of
        the test data. Only the species with fewer samples
        (AMADEC) should receive an output directory, and that
        directory must hold exactly the number of augmentations
        needed to reach the maximum.
        '''
        with tempfile.TemporaryDirectory(dir='/tmp',
                                         prefix='test_spectro') as dst_dir:

            out_dir = self.prep_aug_tmp_dirs(dst_dir)
            # Tell the augmenter where the src and dest roots are:
            self.spectro_augmenter_max.input_dir_path = dst_dir
            self.spectro_augmenter_max.output_dir_path = out_dir

            # AMADEC has 1 spectro, FORANA has 5
            # MAX is 5, so AMADEC needs 4 augmentations:

            num_augs_needed = 4

            self.spectro_augmenter_max.generate_all_augmentations()

            # Should have one directory in aug_spectros
            new_dirs = Utils.listdir_abs(out_dir)
            self.assertEqual(len(new_dirs), 1)

            # AMADEC subdir should have num_augs_needed new files.
            # Must be assertEqual: assertTrue(a, b) treats b as
            # the failure message, and would only check that
            # a is truthy:
            new_files = os.listdir(new_dirs[0])
            self.assertEqual(len(new_files), num_augs_needed)
Пример #20
0
    def setUpClass(cls):
        '''
        Class level fixtures for the spectrogram chopping tests:
        skip size, location of the long test spectrogram, number
        of .png files under the spectrogram root, duration of
        the test spectrogram as recorded in its metadata, and
        the default window length.

        :raise AssertionError: if the test spectrogram has no
            'duration' metadata entry
        '''
        super(TestChopSpectrograms, cls).setUpClass()

        cls.skip_size = 2 # sec

        cls.cur_dir = os.path.dirname(__file__)
        cls.spectro_root = os.path.join(cls.cur_dir, 'spectro_data_long')
        cls.spectro_file = os.path.join(cls.spectro_root, 'DOVE/dove_long.png')

        png_files = Utils.find_in_dir_tree(cls.spectro_root,
                                           pattern='*.png',
                                           entry_type='file')
        cls.num_spectro_files = len(png_files)

        # Duration comes from metadata embedded in the spectrogram:
        _spectro, metadata = SoundProcessor.load_spectrogram(cls.spectro_file)
        try:
            raw_duration = metadata['duration']
        except KeyError:
            raise AssertionError(f"Spectrogram test file {os.path.basename(cls.spectro_file)} has no duration metadata")
        cls.duration = float(raw_duration)

        cls.default_win_len = 5 # seconds
Пример #21
0
    def create_dest_dirs(self, species_list):
        '''
        Creates all directories that will hold new 
        audio snippets and spectrograms for each species:
        
                         self.out_dir
            spectrograms/              wav-files/
              <species>...               <species>...
        
        Directory creation is delegated to utils.create_folder(),
        which receives self.overwrite_policy to decide what
        happens when a directory already exists. A falsy return
        from create_folder() for any directory below the root
        aborts the operation. The return value for the root
        directory itself is not checked.
                
        :param species_list: names of species to process
        :type species_list: [str]
        :return: top level dirs for audio snippets and spectrograms,
            in that order
        :rtype: (str, str)
        :raise FileExistsError: if utils.create_folder() reports
            failure for a directory below the root
        '''

        # Root dir of the two dirs that will hold new 
        # audio snippet and spectrogram files
        utils.create_folder(self.out_dir, overwrite_policy=self.overwrite_policy)

        # Below the root:
        spectrogram_dir_path = os.path.join(self.out_dir,'spectrograms/')
        wav_dir_path = os.path.join(self.out_dir,'wav-files/')

        # Each error message names the directory that
        # actually could not be created:
        for top_dir in (spectrogram_dir_path, wav_dir_path):
            if not utils.create_folder(top_dir, overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {top_dir} exists; aborting")
        
        # One dir each for the audio and spectrogram
        # snippets of one species:
        
        for species in species_list:
            species_spectros_dir = os.path.join(spectrogram_dir_path, species)
            if not utils.create_folder(species_spectros_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {species_spectros_dir} exists; aborting")
            
            species_audio_dir = os.path.join(wav_dir_path, species)
            if not utils.create_folder(species_audio_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {species_audio_dir} exists; aborting")

        return(wav_dir_path, spectrogram_dir_path)
Пример #22
0
 def cull_spectro_paths(cls,
                        species_or_recorder_name,
                        dst_dir,
                        rec_paths,
                        overwrite_policy=WhenAlreadyDone.ASK):
     '''
     Placeholder for removing from rec_paths the files whose
     output already exists in dst_dir. Culling is currently
     DISABLED: rec_paths is returned unchanged, and all other
     arguments are ignored.

     :param species_or_recorder_name: currently unused
     :type species_or_recorder_name: str
     :param dst_dir: currently unused
     :type dst_dir: str
     :param rec_paths: paths of the files to process
     :type rec_paths: [str]
     :param overwrite_policy: currently unused
     :type overwrite_policy: WhenAlreadyDone
     :return: rec_paths, unchanged
     :rtype: [str]
     '''
     #******* DISABLED ************
     # This method is analogous to cull_rec_paths() in
     # create_spectrograms(). If culling is needed after
     # all, adapt that method's body for use here. The copy
     # of that body that used to follow the return statement
     # below was unreachable, and has been removed.
     return rec_paths
Пример #23
0
    def __init__(self, 
                 input_dir_path,
                 plot=False,
                 overwrite_policy=False,
                 aug_goals=AugmentationGoals.MEDIAN,
                 random_augs = False,
                 multiple_augs = False,):

        '''
        Determine how many augmentations each species needs,
        derive the name of the output directory, and create
        that directory.
        
        :param input_dir_path: directory holding .wav files;
            must be an absolute path
        :type input_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: what to do when output already
            exists. Either a WhenAlreadyDone member, or for
            backward compatibility a bool: True is taken as
            WhenAlreadyDone.OVERWRITE (don't ask each time 
            previously created work will be replaced), False 
            as WhenAlreadyDone.ASK
        :type overwrite_policy: {WhenAlreadyDone | bool}
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals; TENTH/MAX/MEDIAN)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :param random_augs: if this is true, will randomly choose augmentation 
            to use for each new sample
        :type random_augs: bool
        :param multiple_augs: if we want to allow multiple augmentations per sample 
            (e.g. time shift and volume)):
        :type multiple_augs: bool
        :raise TypeError: if overwrite_policy is neither a bool
            nor a WhenAlreadyDone member
        :raise ValueError: if input_dir_path is not absolute
        '''

        self.log = LoggingService()

        # The declared default (False), and the documented bool
        # values used to be rejected by the type check below,
        # which made the default unusable. Map bools to the
        # corresponding policy first:
        if isinstance(overwrite_policy, bool):
            overwrite_policy = (WhenAlreadyDone.OVERWRITE
                                if overwrite_policy
                                else WhenAlreadyDone.ASK)

        if not isinstance(overwrite_policy, WhenAlreadyDone):
            raise TypeError(f"Overwrite policy must be a member of WhenAlreadyDone, not {type(overwrite_policy)}") 

        if not os.path.isabs(input_dir_path):
            raise ValueError(f"Input path must be a full, absolute path; not {input_dir_path}")

        self.input_dir_path   = input_dir_path
        self.multiple_augs    = multiple_augs
        self.plot             = plot
        self.overwrite_policy = overwrite_policy
        
        self.species_names = Utils.find_species_names(self.input_dir_path)

        # If aug_goals is not a dict mapping
        # each species to an aug_goals, but just
        # a single AugmentationGoals, create
        # a dict from all bird species, mapping
        # each to that same value. (isinstance, so
        # that dict subclasses are left as they are):
        
        if not isinstance(aug_goals, dict):
            aug_goals = {species : aug_goals
                          for species in self.species_names
                          }

        # Get dataframe with row labels being the
        # species, and one col with number of samples
        # in the respective species:
        #       num_samples
        # sp1       10
        # sp2       15
        #      ..

        self.sample_distrib_df = Utils.sample_compositions_by_species(input_dir_path, 
                                                                      augmented=False)
        
        if plot:
            # Plot a distribution:
            self.sample_distrib_df.plot.bar()

        # Build a dict with number of augmentations to do
        # for each species:
        self.augs_to_do = Utils.compute_num_augs_per_species(aug_goals, 
                                                             self.sample_distrib_df)
        
        # Create the descriptive name of an output directory 
        # for the augmented samples. The directory is a sibling
        # of the input directory:
        parent_dir = Path(input_dir_path).parent
        if random_augs:
            self.output_dir_path = os.path.join(parent_dir, 
                                                'augmented_samples_random')
        else:
            # The three class level augmentation
            # proportions must add up to 1:
            assert(self.ADD_NOISE + self.TIME_SHIFT + self.VOLUME == 1)
            dir_nm = f"Augmented_samples_-{self.ADD_NOISE:.2f}n-{self.TIME_SHIFT:.2f}ts-{self.VOLUME:.2f}w"
            self.output_dir_path = os.path.join(parent_dir, dir_nm)

        if self.multiple_augs:
            self.output_dir_path += "/"
        else:
            # Indicate that augmentations are mutually exclusive
            self.output_dir_path += "-exc/"  

        self.log.info(f"Results will be in {self.output_dir_path}")

        Utils.create_folder(self.output_dir_path, self.overwrite_policy)

        # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
        warnings.filterwarnings(action="ignore",
                                message="PySoundFile failed. Trying audioread instead.",
                                category=UserWarning, 
                                module='', 
                                lineno=0)
Пример #24
0
    def augment_one_species(self, in_dir, out_dir, num_augs_to_do):
        '''
        Takes one species, and a number of audio
        augmentations to do. Generates the files,
        and returns the paths of the newly created 
        files, together with the number of failed 
        augmentation attempts.
        
        The requested augmentations are distributed round 
        robin across all of the species' audio files, such 
        that each file is augmented roughly the same number 
        of times until num_augs_to_do is accomplished.
        
        For each file the augmentation methods are drawn at
        random without replacement from AudAugMethod. If a
        file is assigned more augmentations than there are 
        methods, the drawn sequence of methods is repeated 
        as often as needed.

        If the output folder cannot be prepared, or if in_dir
        holds no files, nothing is generated, and ([], 0) is 
        returned. All exits return the same 2-tuple shape.

        :param in_dir: directory holding one species' audio files
        :type in_dir: str
        :param out_dir: destination for new audio files
        :type out_dir: str
        :param num_augs_to_do: number of augmentations
        :type num_augs_to_do: int
        :returns: list of newly created file paths, and
            number of augmentations that failed
        :rtype: ([str], int)
        '''
        
        # By convention, species name is the last part of the 
        # directory. Use .name: .stem would cut the name at 
        # its last dot:
        species_name = Path(in_dir).name
        
        # Create subfolder for the given species:
        if not Utils.create_folder(out_dir, self.overwrite_policy):
            self.log.info(f"Skipping augmentations for {species_name}")
            return [], 0

        # Get dict: {full-path-to-an-audio_file : 0}
        # The zeroes will be counts of augmentations
        # needed for that file:    
        in_wav_files     = {full_in_path : 0
                            for full_in_path
                            in Utils.listdir_abs(in_dir)
                            } 
        # Cannot do augmentations for species with 0 samples
        if len(in_wav_files) == 0:
            self.log.info(f"Skipping for {species_name} since there are no original samples.")
            return [], 0

        # Distribute augmentations across the original
        # input files, one at a time, round robin:
        aug_assigned = 0
        while aug_assigned < num_augs_to_do:
            for fname in in_wav_files:
                in_wav_files[fname] += 1
                aug_assigned += 1
                if aug_assigned >= num_augs_to_do:
                    break

        new_sample_paths = []
        failures = 0
        all_methods = list(AudAugMethod)

        for in_fname, num_augs_this_file in in_wav_files.items():

            if num_augs_this_file == 0:
                # Nothing was assigned to this file:
                continue

            # Pick audio aug methods to apply (without replacement)
            # Note that if more augs are to be applied to each file
            # than methods are available, some methods will need
            # to be applied multiple times; no problem, as each
            # method includes randomness:
            max_methods_sample_size = min(len(all_methods), num_augs_this_file)
            methods = random.sample(all_methods, max_methods_sample_size)
            
            # Now have something like:
            #     [volume, time-shift], or all methods: [volume, time-shift, noise]
            
            if num_augs_this_file > len(methods):
                # Repeat the methods as often as
                # needed:
                num_method_set_repeats = int(math.ceil(num_augs_this_file/len(methods)))
                # The slice to num_augs_this_file chops off
                # the possible excess from the array replication: 
                method_seq = (methods * num_method_set_repeats)[:num_augs_this_file]
                
                # Assuming num_augs_this_file is 7, we now have method_seq:
                #    [m1,m2,m3,m1,m2,m3,m1]
            else:
                method_seq = methods
                
            for method in method_seq:
                # create_new_sample() hands back either the path
                # of the new file, or the exception that occurred:
                out_path_or_err = self.create_new_sample(in_fname, out_dir, method)
                if isinstance(out_path_or_err, Exception):
                    failures += 1
                else:
                    new_sample_paths.append(out_path_or_err)

        self.log.info(f"Audio aug report: {len(new_sample_paths)} new files; {failures} failures")
                
        return new_sample_paths, failures
Пример #25
0
 def img_generator(self, in_img_or_dir):
     '''
     Return an iterator over image paths.

     If in_img_or_dir is an existing file, the iterator
     feeds out just that one path. Otherwise the argument
     is handed to Utils.find_in_tree_gen() together with
     the pattern '*.png', and that call's result is returned.

     :param in_img_or_dir: path to a single image file, or
         (presumably) root of a directory tree with .png files
     :type in_img_or_dir: str
     :return: iterator over image path(s)
     :rtype: Iterator(str)
     '''
     is_single_file = os.path.isfile(in_img_or_dir)
     if not is_single_file:
         return Utils.find_in_tree_gen(in_img_or_dir, '*.png')
     return iter([in_img_or_dir])
    def create_snips_gen_for_sel_tbls(self, snippets_src, sel_tables_src):
        '''
        Given one or more Raven selection tables, 
        and one or more recording snippet paths, return
        a dict:
        
               {<recording-id> : SelTblSnipsAssoc-inst<table-path, snippets-src>}

        where recording-id is like AM01_20190719_063242, and table-path
        is the full path to one selection table with the respective
        recording-id. The recording id is obtained from each table
        path via self.extract_recording_id(); tables for which that
        method returns None are skipped. If two tables yield the same
        recording id, the later one replaces the earlier one.
        
        NOTE: every SelTblSnipsAssoc instance is handed the *same*
        snippets iterator object.
        
        Usage concept:
            o There are relatively few selection tables, since they
              are human-generated
            o There can be thousands of snippet .png files whose time spans
              are covered in one table
            o The data structure returned from this method can be
              used like this:
              
                    tbl_snips_match = create_snips_gen_for_sel_tbls('/foo/my_snips', '/bar/my_tbls')
                    
                    # For each selection table, work on the snippets
                    # that are covered by that table
                    
                    for rec_id, assoc in tbl_snips_match.items():
                        for snip_info in assoc:
                            <do something with spectrogram snippet>
        
        
        :param snippets_src: iterator over absolute paths to snippets,
            or the absolute path to a directory
        :type snippets_src: {Iterator(str) | str}
        :param sel_tables_src: absolute path to selection table, or path 
            to a directory that contains selection tables, or
            iterator over absolute paths to selection tables
        :type sel_tables_src: {Iterator(str) | str}
        :return: dict mapping recording ID to SelTblSnipsAssoc instances
        :rtype: {str : SelTblSnipsAssoc}
        :raise ValueError: if either source is neither an iterator
            nor a suitable absolute path
        '''

        # Needed only for the argument checks below:
        from collections.abc import Iterator

        # Table paths may be an individual
        # file, a directory, or an iterator
        # of absolute paths. Sanity checks:

        if isinstance(sel_tables_src, str):
            if not os.path.isabs(sel_tables_src):
                raise ValueError(
                    "Table paths must be a generator, or an absolute path to a selection table or dir"
                )
            if os.path.isfile(sel_tables_src):
                sel_tables_src = [sel_tables_src]
            elif os.path.isdir(sel_tables_src):
                sel_tables_src = Utils.listdir_abs(sel_tables_src)
            else:
                # Without this guard the loop below would
                # iterate over the characters of the path:
                raise ValueError(
                    f"Selection table location {sel_tables_src} is neither a file nor a directory"
                )
        # If not a string, sel_tables_src better be an
        # iterator (generators are iterators):
        elif not isinstance(sel_tables_src, Iterator):
            raise ValueError(
                "Table paths must be a generator, or an absolute path to a selection table or dir"
            )

        # Same checks for snippet location:
        if isinstance(snippets_src, str):
            if not (os.path.isabs(snippets_src)
                    and os.path.isdir(snippets_src)):
                raise ValueError(
                    "Snippet paths must be a generator, or an absolute path to a snippet dir"
                )
            snippets_src = iter(Utils.listdir_abs(snippets_src))
        # If not a string, snippets_src better be an iterator.
        # (Checking snippets_src here, not sel_tables_src: the
        # latter may already have been turned into a list above.)
        elif not isinstance(snippets_src, Iterator):
            raise ValueError(
                "Snippets src must be a generator, or an absolute path to dir"
            )

        # Build a dict:
        #    {<recording_id> : <assoc of table path and snippets>}
        recording_selection_tables = {}
        for table_path in sel_tables_src:
            recording_id = self.extract_recording_id(table_path)
            if recording_id is None:
                # Not recognizable as a selection
                # table of a recording; skip:
                continue
            recording_selection_tables[recording_id] = \
                SelTblSnipsAssoc(table_path, snippets_src)

        return recording_selection_tables
    def __init__(self,
                 selection_tbl_loc,
                 spectrogram_locs,
                 out_dir,
                 unittesting=False):
        '''
        Create snippet copies into out_dir 
        for all snippets that are covered
        by any of the given selection tables.
        
        All work happens during construction: selection
        tables and snippets are located, associated with each 
        other via create_snips_gen_for_sel_tbls(), and each
        snippet is passed to match_snippet() together with the
        rows of its selection table.
        
        If either input location does not exist, a message
        is printed, and the process exits with status 1.
        
        :param selection_tbl_loc: path to individual selection
            table or a directory containing selection tables.
            Each tbl is a tsv file with extension .txt
        :type selection_tbl_loc: str
        :param spectrogram_locs: individual or directory of 
            spectrogram snippets.
        :type spectrogram_locs: str
        :param out_dir: destination of snippet copies;
            created if it does not exist
        :type out_dir: str
        :param unittesting: if True, does not initialize
            the instance, or run any operations
        :type unittesting: bool
        '''

        if unittesting:
            return

        if not os.path.exists(selection_tbl_loc):
            print(f"Cannot open {selection_tbl_loc}")
            sys.exit(1)

        if not os.path.exists(spectrogram_locs):
            print(f"Spectrogram snippets {spectrogram_locs} not found")
            sys.exit(1)

        # Is path to sel tbl an individual tsv file?
        if os.path.isfile(selection_tbl_loc):
            # A generator (rather than iter([...]), which is
            # a list_iterator) so that the source passes a
            # types.GeneratorType check in
            # create_snips_gen_for_sel_tbls():
            table_paths = (tbl_path for tbl_path in [selection_tbl_loc])
        else:
            # Caller gave directory of .txt files.
            # Get them all recursively:
            table_paths = Utils.find_in_tree_gen(selection_tbl_loc,
                                                 pattern="*.txt")

        # Is snippets path to an individual .png snippet file?
        if os.path.isfile(spectrogram_locs):
            # Generator for the same reason as above:
            snippet_paths = (snip_path for snip_path in [spectrogram_locs])
        else:
            # Caller gave directory of .png files.
            # Get them all recursively:
            snippet_paths = Utils.find_in_tree_gen(spectrogram_locs,
                                                   pattern="*.png")

        # If out_dir does not exist, create it,
        # and all dirs along the path. No separate
        # existence test, so no race with other processes:
        os.makedirs(out_dir, exist_ok=True)

        # Get dict:
        #    {<recording-id> : SelTblSnipsAssoc-instance}
        # where each SelTblSnipsAssoc instance is a generator
        # of snippet metadata from snippet that are covered in
        # the selection table that is associated with the instance.
        # In addition the absolute snippet path is added as
        # entry of key 'snip_path'.
        #
        # The generator feeds out the snippet metadata in order of
        # start time.
        #
        # For brevity, call each instance of SelTblSnipsAssoc
        # an 'assoc'

        rec_id_assocs = self.create_snips_gen_for_sel_tbls(
            snippet_paths, table_paths)

        for assoc in rec_id_assocs.values():
            # The assoc focuses on a single selection
            # table, and the snippets it covers.
            # Get the info contained in each row of
            # the sel tb. This will be a list of dicts, each with
            # the information from one selection tbl row:

            selections = Utils.read_raven_selection_table(
                assoc.raven_sel_tbl_path)

            # Go through each snippet in the association, enrich its
            # metadata with species info. Then copy the enriched
            # snippet to the target dir:

            for snip_metadata in assoc:
                self.match_snippet(selections, snip_metadata, out_dir)
Пример #28
0
    def add_background(cls,
                       file_name,
                       noise_path,
                       out_dir,
                       len_noise_to_add=5.0):
        '''
        Takes an absolute file path, and the path to a
        directory that contains noise to overlay onto the 
        given sound file (wind, rain, etc.).
        
        A noise file is picked at random from noise_path. 
        Up to len_noise_to_add seconds of it are overlaid at 
        a random position of the recording. If the recording 
        itself is shorter than len_noise_to_add, only 5% of the 
        recording's length worth of noise is added. 
        
        The result is written to out_dir as a new .wav file. 
        Its name is composed of the sample's file stem, the noise 
        file's stem, and the position (in msec) at which the 
        noise starts; the name is made unique within out_dir.
        
        If noise and recording have different sample rates,
        both are resampled to a common rate before mixing.

        :param file_name: absolute path to sound file
        :type file_name: str
        :param noise_path: absolute path to directory
            with noise files
        :type noise_path: str
        :param out_dir: destination directory of new audio file
        :type out_dir: str
        :param len_noise_to_add: how much of a noise snippet
            to overlay (seconds)
        :type len_noise_to_add: float
        :return: full path of new audio file
        :rtype: str
        :raise ValueError: if noise_path contains no files
        '''

        len_noise_to_add = float(len_noise_to_add)
        backgrounds = os.listdir(noise_path)
        if not backgrounds:
            raise ValueError(f"No noise files found in {noise_path}")

        # Pick a random noise file:
        background_name = random.choice(backgrounds)

        cls.log.info(f"Adding {background_name} to {file_name}.")

        # We will be working with 1 second as the smallest unit of time
        # load all of both wav files and determine the length of each
        noise, noise_sr = SoundProcessor.load_audio(
            os.path.join(noise_path,
                         background_name))  # type(noise) = np.ndarray
        orig_recording, orig_sr = SoundProcessor.load_audio(file_name)

        # NOTE(review): the gcd of two unrelated sample rates can be
        # very low (e.g. 44100 vs. 48000 ==> 300); confirm that this
        # is the intended common rate.
        new_sr = math.gcd(noise_sr, orig_sr)
        if noise_sr != orig_sr:
            # Resample both noise and orig records so that they have same sample rate
            cls.log.info(f"Resampling: {background_name} and {file_name}")
            # Keyword args: recent librosa versions no longer
            # accept the sample rates positionally:
            noise = librosa.resample(noise,
                                     orig_sr=noise_sr,
                                     target_sr=new_sr)
            orig_recording = librosa.resample(orig_recording,
                                              orig_sr=orig_sr,
                                              target_sr=new_sr)

        # From here on both signals are at new_sr, so
        # all durations must be computed with that rate:
        noise_duration = librosa.get_duration(y=noise, sr=new_sr)
        if noise_duration < len_noise_to_add:
            cls.log.info(
                f"Duration:{noise_duration} < len_noise_to_add:{len_noise_to_add}. Will only add {noise_duration}s of noise"
            )
        else:
            # Randomly choose a noise segment of
            # the requested length:
            requested_samples = int(new_sr * len_noise_to_add)
            subsegment_start = random.randint(0,
                                              len(noise) - requested_samples)
            noise = noise[subsegment_start:subsegment_start +
                          requested_samples]
        cls.log.info(
            f"len(noise) after random segment: {len(noise)}; noise duration: {len(noise)/new_sr}"
        )

        orig_duration = librosa.get_duration(y=orig_recording, sr=new_sr)
        # if orig_recording is shorter than the noise we want to add, just add 5% noise
        if orig_duration < len_noise_to_add:
            cls.log.info(
                f"Recording: {file_name} was shorter than len_noise_to_add. Adding 5% of recording len worth of noise."
            )
            new_noise_len = orig_duration * 0.05
            noise = noise[:int(new_noise_len * new_sr)]

        # The noise may have been shortened in either of the
        # branches above; the segment to modify is exactly as
        # long as the noise that is left:
        samples_per_segment = len(noise)

        noise_start_loc = random.randint(
            0,
            len(orig_recording) - samples_per_segment)
        noise_end_loc = noise_start_loc + samples_per_segment
        cls.log.info(
            f"Inserting noise starting at {noise_start_loc/new_sr} seconds.")
        # split original into three parts: before_noise, during_noise, after_noise
        before_noise = orig_recording[:noise_start_loc]
        during_noise = orig_recording[noise_start_loc:noise_end_loc]
        after_noise = orig_recording[noise_end_loc:]

        assert len(during_noise) == len(noise)

        segment_with_noise = during_noise + Utils.noise_multiplier(
            orig_recording, noise) * noise
        new_sample = np.concatenate(
            (before_noise, segment_with_noise, after_noise))

        # Overlaying noise must not change the recording's
        # length. Compare sample counts rather than float durations:
        assert len(new_sample) == len(orig_recording)

        # File name w/o extension:
        sample_file_stem = Path(file_name).stem
        noise_file_stem = Path(background_name).stem
        # Position of noise start in msec:
        noise_start_ms = str(int(noise_start_loc / new_sr * 1000))
        out_fname = f"{sample_file_stem}-{noise_file_stem}_bgd{noise_start_ms}ms.wav"

        # Ensure that the fname doesn't exist:
        uniq_fname = Utils.unique_fname(out_dir, out_fname)
        out_path = os.path.join(out_dir, uniq_fname)

        soundfile.write(out_path, new_sample, new_sr)
        return out_path
Пример #29
0
    def test_from_commandline(self):
        '''
        Run SpectrogramChopper.run_workers() three times
        into the same temporary destination:
        
           o with OVERWRITE: more .png files must be found 
               below the destination than exist in the source tree
           o with SKIP: the recorded file times must be
               unchanged
           o with OVERWRITE again: the recorded file times
               must differ from those of the first run
        '''
        with tempfile.TemporaryDirectory(prefix='test_spectro',
                                         dir='/tmp') as dst_dir:

            args = Arguments()
            args.input   = self.spectro_root
            args.outdir  = dst_dir
            args.workers = None

            # Spectrogram .png files in source tree:
            spectros_to_chop = Utils.find_in_dir_tree(self.spectro_root, '*.png')

            manager = mp.Manager()

            def chop(policy):
                # Each run gets a fresh shared
                # job status structure:
                job_info = manager.dict()
                job_info['jobs_status'] = manager.list()
                SpectrogramChopper.run_workers(args,
                                               job_info,
                                               overwrite_policy=policy)
                # Full paths of everything now sitting
                # directly below the destination:
                return [os.path.join(dst_dir, species_dir)
                        for species_dir
                        in os.listdir(dst_dir)]

            # ------ Chop spectrograms:
            dirs_filled = chop(WhenAlreadyDone.OVERWRITE)

            num_spectros_done = 0
            for one_filled_dir in dirs_filled:
                num_spectros_done += len(
                    Utils.find_in_dir_tree(one_filled_dir, '*.png'))

            self.assertTrue(num_spectros_done > len(spectros_to_chop))
            self.check_spectro_sanity(dirs_filled)

            # Remember the creation times:
            file_times = self.record_creation_times(dirs_filled)

            # ------ SKIP the existing spectrograms:
            # Second run asks to leave existing spectros
            # alone, so the times of the png files must 
            # NOT have changed:
            dirs_filled = chop(WhenAlreadyDone.SKIP)
            new_file_times = self.record_creation_times(dirs_filled)
            self.assertDictEqual(new_file_times, file_times)

            # ------ Force RECREATION of spectrograms:
            dirs_filled = chop(WhenAlreadyDone.OVERWRITE)
            self.check_spectro_sanity(dirs_filled)

            # File times must be *different* from the first
            # run because we asked to overwrite:
            new_file_times = self.record_creation_times(dirs_filled)
            for fname, old_time in file_times.items():
                try:
                    new_time = new_file_times[fname]
                except KeyError as err:
                    # File of first run is absent after
                    # the third: only reported, not a failure
                    print(repr(err))
                    continue
                self.assertTrue(new_time != old_time)
Пример #30
0
    # Enforce args to set_info or add_info being
    # equal length, i.e. having 'names' and 'values'
    # as pairs:
    if (setting and len(info_to_set) % 2 != 0) \
       or (adding and len(info_to_add) % 2 != 0):
        print(
            "Info entries must be pairs of keys and values; length is odd numbered here"
        )
        sys.exit(1)

    # Safety precaution just for setting
    # (and thereby overwriting) metadata:

    if setting and not args.force:
        if not Utils.user_confirm(
                "Really want to overwrite png file metadata? (N/y)",
                default='n'):
            print("Canceling")
            sys.exit(0)

    if args.printout:
        print("Metadata before:")
        PNGMetadataManipulator.extract_metadata(args.snippet_src,
                                                show=args.show,
                                                printout=args.printout)
        print("")

    # Setting info_to_set:
    if args.outfile is None:
        # Overwrite the input file,
        # i.e. add metadata in place: