Exemplo n.º 1
0
    def setUp(self):
        """Build fixtures from the gy6or6 ``.cbin`` test data.

        Stores spectrogram parameters, a freshly created temporary output
        directory, the sorted audio files, the 'notmat' annotation files and
        the annotations loaded from them. Also splits
        (annotation file, audio file name) pairs into ``self.good`` and
        ``self.bad`` according to whether every label of the annotation
        is in ``self.labelset_cbin``.
        """
        self.spect_params = {
            'fft_size': 512,
            'step_size': 64,
            'freq_cutoffs': (500, 10000),
            'thresh': 6.25,
            'transform_type': 'log_spect',
        }

        self.tmp_output_dir = tempfile.mkdtemp()

        # ---- cbins -------------------------------
        self.audio_dir_cbin = TEST_DATA_DIR.joinpath('cbins', 'gy6or6',
                                                     '032312')
        cbin_paths = sorted(self.audio_dir_cbin.glob('*.cbin'))
        self.audio_files_cbin = [str(cbin_path) for cbin_path in cbin_paths]

        self.annot_files_cbin = files_from_dir(annot_dir=self.audio_dir_cbin,
                                               annot_format='notmat')
        notmat_scribe = crowsetta.Transcriber(annot_format='notmat')
        self.annot_list_cbin = notmat_scribe.from_file(
            annot_file=self.annot_files_cbin)

        self.labelset_cbin = set('iabcdefghjk')

        # partition (annotation file, audio file name) pairs so tests can
        # verify that filtering by labelset works:
        # "good" = all labels in annotation are in labelset
        # "bad" = has labels not in labelset
        good_pairs = []
        bad_pairs = []
        for annot_file, annot in zip(self.annot_files_cbin,
                                     self.annot_list_cbin):
            if set(annot.seq.labels).issubset(self.labelset_cbin):
                bucket = good_pairs
            else:
                bucket = bad_pairs
            bucket.append((annot_file, Path(annot.audio_file).name))

        self.good = good_pairs
        self.bad = bad_pairs
Exemplo n.º 2
0
    def setUp(self):
        """Build fixtures from the llb3 ``.mat`` test data.

        Stores the sorted spectrogram files (as strings), the spectrogram
        format, the path to the annotation file, a 'yarden'-format
        Transcriber, the annotations it loads, and the label set.
        """
        self.spect_dir = TEST_DATA_DIR.joinpath('mat', 'llb3', 'spect')
        mat_paths = self.spect_dir.glob('*.mat')
        self.spect_files = sorted(str(mat_path) for mat_path in mat_paths)
        self.spect_format = 'mat'

        self.annot_mat = str(
            TEST_DATA_DIR.joinpath('mat', 'llb3', 'llb3_annot_subset.mat'))
        self.scribe = crowsetta.Transcriber(annot_format='yarden')
        self.annot_list = self.scribe.from_file(self.annot_mat)
        self.labelset_mat = {
            1, 2, 3,
            6, 7, 8, 9, 10, 11, 12, 13, 14,
            16, 17, 18, 19,
        }
Exemplo n.º 3
0
    def test_lbl_tb2segments_recovers_onsets_offsets_labels_from_real_data(
            self):
        """Round-trip annotations through labeled time bins and back.

        Each annotation is converted to a vector of labeled time bins with
        ``label_timebins``; ``lbl_tb2segments`` must then return labels equal
        to the annotated ones, and onsets / offsets close to the annotated
        ones (within the tolerances given below).
        """
        # TODO: make all this into fixture(s?) when switching to PyTest
        notmat_scribe = crowsetta.Transcriber(annot_format='notmat')
        # keep only annotations whose labels are all keys of the label map
        usable_annots = [
            annot
            for annot in notmat_scribe.from_file(annot_file=ANNOT_PATHS)
            if all(lbl in LABELMAP.keys() for lbl in annot.seq.labels)
        ]
        spect_annot_map = vak.annotation.source_annot_map(
            SPECT_PATHS,
            usable_annots,
        )

        def labeled_timebins_for(spect_file, annot):
            # map string labels to integers, then label each time bin
            int_labels = [LABELMAP[lbl] for lbl in annot.seq.labels]
            time_bins = vak.files.spect.load(spect_file)[TIMEBINS_KEY]
            return vak.labeled_timebins.label_timebins(
                int_labels,
                annot.seq.onsets_s,
                annot.seq.offsets_s,
                time_bins,
                unlabeled_label=LABELMAP['unlabeled'])

        lbl_tb_list = [
            labeled_timebins_for(spect_file, annot)
            for spect_file, annot in spect_annot_map.items()
        ]

        for lbl_tb, annot in zip(lbl_tb_list, spect_annot_map.values()):
            labels, onsets_s, offsets_s = vak.labeled_timebins.lbl_tb2segments(
                lbl_tb, LABELMAP, TIMEBIN_DUR)

            self.assertTrue(np.array_equal(labels, annot.seq.labels))
            self.assertTrue(
                np.allclose(onsets_s, annot.seq.onsets_s,
                            atol=0.001, rtol=0.03))
            self.assertTrue(
                np.allclose(offsets_s, annot.seq.offsets_s,
                            atol=0.001, rtol=0.03))
Exemplo n.º 4
0
    def test_source_annot_map_cbin_yarden(self):
        """Check ``source_annot_map`` with 'yarden' annotations and .mat spectrogram files."""
        mat_dir = TEST_DATA_DIR.joinpath('mat', 'llb3')
        yarden_scribe = crowsetta.Transcriber(annot_format='yarden')
        annot_list = yarden_scribe.from_file(
            annot_file=str(mat_dir.joinpath('llb3_annot_subset.mat')))

        spect_files = [
            str(mat_path) for mat_path in mat_dir.joinpath('spect').glob('*.mat')
        ]

        source_annot_map = vak.annotation.source_annot_map(
            source_files=spect_files, annot_list=annot_list)

        # iterate over a snapshot of the keys, since entries are removed
        # from the mapping as they are verified
        for source in list(source_annot_map.keys()):
            annot = source_annot_map[source]
            self.assertTrue(source in spect_files)
            self.assertTrue(annot in annot_list)
            source_annot_map.pop(source)

        # every entry was checked and removed above,
        # so nothing should remain in the mapping
        self.assertTrue(source_annot_map == {})
Exemplo n.º 5
0
    def test_source_annot_map_cbin_notmat(self):
        """Check ``source_annot_map`` with 'notmat' annotations and .cbin audio files."""
        cbin_dir = TEST_DATA_DIR.joinpath('cbins', 'gy6or6', '032312')
        notmats = [str(notmat_path) for notmat_path in cbin_dir.glob('*.not.mat')]
        notmat_scribe = crowsetta.Transcriber(annot_format='notmat')
        annot_list = notmat_scribe.from_file(annot_file=notmats)

        audio_files = [str(cbin_path) for cbin_path in cbin_dir.glob('*.cbin')]
        source_annot_map = vak.annotation.source_annot_map(
            source_files=audio_files, annot_list=annot_list)

        # iterate over a snapshot of the keys, since entries are removed
        # from the mapping as they are verified
        for source in list(source_annot_map.keys()):
            annot = source_annot_map[source]
            self.assertTrue(source in audio_files)
            self.assertTrue(annot in annot_list)
            source_annot_map.pop(source)

        # every entry was checked and removed above,
        # so nothing should remain in the mapping
        self.assertTrue(source_annot_map == {})
Exemplo n.º 6
0
    def test_source_annot_map_wav_koumura(self):
        """Check ``source_annot_map`` with 'koumura' annotations and .wav audio files."""
        koumura_dir = TEST_DATA_DIR.joinpath('koumura', 'Bird0')
        wavpath = koumura_dir.joinpath('Wave')
        koumura_scribe = crowsetta.Transcriber(annot_format='koumura')
        annot_list = koumura_scribe.from_file(
            annot_file=str(koumura_dir.joinpath('Annotation.xml')),
            wavpath=str(wavpath))

        audio_files = [str(wav_path) for wav_path in wavpath.glob('*.wav')]
        source_annot_map = vak.annotation.source_annot_map(
            source_files=audio_files, annot_list=annot_list)

        # iterate over a snapshot of the keys, since entries are removed
        # from the mapping as they are verified
        for source in list(source_annot_map.keys()):
            annot = source_annot_map[source]
            self.assertTrue(source in audio_files)
            self.assertTrue(annot in annot_list)
            source_annot_map.pop(source)

        # every entry was checked and removed above,
        # so nothing should remain in the mapping
        self.assertTrue(source_annot_map == {})
Exemplo n.º 7
0
def main(train_dur=TRAIN_DUR,
         val_dur=VAL_DUR,
         annot_ext=ANNOT_EXT,
         annot_format=ANNOT_FORMAT,
         subset_dir=SUBSET_DIR,
         labelset=None):
    """Make a subset of the files in the current directory with a target duration.

    Annotation files in the current working directory are taken in sorted
    order until their summed duration reaches ``train_dur + val_dur``.
    Every file whose name starts with the stem of a selected annotation file
    is then moved (not copied) into the newly-created ``subset_dir``.

    Parameters
    ----------
    train_dur : int or float
        target duration of the training set.
    val_dur : int or float
        target duration of the validation set.
    annot_ext : str
        suffix used to find annotation files, via ``glob(f'*{annot_ext}')``.
    annot_format : str
        annotation format passed to ``crowsetta.Transcriber``.
    subset_dir : str
        directory to create and move the selected files into.
        ``os.makedirs`` raises an error if it already exists.
    labelset : str or iterable of str, optional
        allowed labels. A single string is treated as a set of
        single-character labels. If None or empty (the default),
        annotations are not filtered by label.

    Raises
    ------
    ValueError
        if the annotation files run out before the target duration is reached.
    """
    # Previously ``list(labelset)`` raised TypeError for the default of None.
    allowed_labels = set(labelset) if labelset else None

    annot_files = sorted(glob(f'*{annot_ext}'))
    scribe = crowsetta.Transcriber(annot_format=annot_format)
    annots = scribe.from_file(annot_files)

    total_dur = train_dur + val_dur
    dur = 0
    annots_to_use = []
    for annot in annots:
        if dur >= total_dur:
            break

        # skip annotations that contain labels outside the allowed set
        if (allowed_labels is not None
                and not set(annot.seq.labels).issubset(allowed_labels)):
            continue

        # the offset of the last segment is used as the duration of the file
        dur += annot.seq.offsets_s[-1]
        annots_to_use.append(annot)

    # Iterating (instead of indexing with a counter checked by ``>``) avoids
    # the IndexError that used to occur when the annotations were exhausted.
    if dur < total_dur:
        raise ValueError(
            f'ran out of annotation files before finding subset of duration {total_dur}'
        )

    os.makedirs(subset_dir)

    for annot in annots_to_use:
        # everything before the first '.' in the annotation file name
        annot_stem = Path(annot.annot_file).name.split('.')[0]
        for file in glob(f'{annot_stem}*'):
            shutil.move(file, subset_dir)
Exemplo n.º 8
0
def main(train_dur=TRAIN_DUR,
         val_dur=VAL_DUR,
         annot_ext=ANNOT_EXT,
         voc_format=VOC_FORMAT,
         subset_dir=SUBSET_DIR,
         labelset=None):
    """Make a subset of the files in the current directory with a target duration.

    Annotation files in the current working directory are taken in sorted
    order until their summed duration reaches ``train_dur + val_dur``.
    Every file whose name starts with the stem of a selected sequence's file
    is then moved (not copied) into the newly-created ``subset_dir``.

    Parameters
    ----------
    train_dur : int or float
        target duration of the training set.
    val_dur : int or float
        target duration of the validation set.
    annot_ext : str
        suffix used to find annotation files, via ``glob(f'*{annot_ext}')``.
    voc_format : str
        vocalization annotation format passed to ``crowsetta.Transcriber``.
    subset_dir : str
        directory to create and move the selected files into.
        ``os.makedirs`` raises an error if it already exists.
    labelset : str or iterable of str, optional
        allowed labels. A single string is treated as a set of
        single-character labels. If None or empty (the default),
        sequences are not filtered by label.

    Raises
    ------
    ValueError
        if the annotation files run out before the target duration is reached.
    """
    # Previously ``list(labelset)`` raised TypeError for the default of None.
    allowed_labels = set(labelset) if labelset else None

    annot_files = sorted(glob(f'*{annot_ext}'))
    scribe = crowsetta.Transcriber(voc_format=voc_format)
    seqs = scribe.to_seq(annot_files)

    total_dur = train_dur + val_dur
    dur = 0
    seqs_to_use = []
    for seq in seqs:
        if dur >= total_dur:
            break

        # skip sequences that contain labels outside the allowed set
        if (allowed_labels is not None
                and not set(seq.labels).issubset(allowed_labels)):
            continue

        # the offset of the last segment is used as the duration of the file
        dur += seq.offsets_s[-1]
        seqs_to_use.append(seq)

    # Iterating (instead of indexing with a counter checked by ``>``) avoids
    # the IndexError that used to occur when the sequences were exhausted.
    if dur < total_dur:
        raise ValueError(
            f'ran out of annotation files before finding subset of duration {total_dur}'
        )

    os.makedirs(subset_dir)

    for seq in seqs_to_use:
        seq_stem = Path(seq.file).stem
        for seq_file in glob(f'{seq_stem}*'):
            shutil.move(seq_file, subset_dir)
Exemplo n.º 9
0
def from_df(vak_df):
    """Load the annotations for every row of a vak DataFrame.

    Parameters
    ----------
    vak_df : DataFrame
        representing a dataset of vocalizations. The columns 'annot_path'
        and 'audio_path' are read here; the annotation format is obtained
        with ``format_from_df``.

    Returns
    -------
    annots : list or None
        one annotation per row of the dataframe, in row order.
        None if ``format_from_df`` returns None for the dataframe.

    Raises
    ------
    ValueError
        if the 'annot_path' column holds neither a single path shared by all
        rows nor a distinct path for each row, or if the single shared file
        does not yield enough annotations to pair with the rows.

    Notes
    -----
    Two layouts are handled. When all rows share one annotation file, that
    file is loaded once and its annotations are matched to the rows by
    audio path (using ``source_annot_map``). When every row has its own
    annotation file, each file is loaded separately.
    """
    annot_format = format_from_df(vak_df)
    if annot_format is None:
        return None

    scribe = crowsetta.Transcriber(annot_format=annot_format)

    unique_annot_paths = vak_df['annot_path'].unique()
    n_rows = len(vak_df)

    if len(unique_annot_paths) == n_rows and len(unique_annot_paths) != 1:
        # --> a distinct annotation file for each row: load them one by one
        return [
            scribe.from_file(annot_file=row_annot_path)
            for row_annot_path in vak_df['annot_path'].values
        ]

    if len(unique_annot_paths) != 1:
        raise ValueError(
            'unable to load labels from dataframe; did not find an annotation file for each row or '
            'a single annotation file associated with all rows.')

    # --> a single annotation file is associated with all rows, either because
    # (1) many rows all point to the same file, or
    # (2) there is only one row (whose file may still annotate several sources)
    annot_path = unique_annot_paths.item()
    annots = scribe.from_file(annot_file=annot_path)

    got_single_annot = isinstance(annots, crowsetta.Annotation)
    list_covers_rows = isinstance(annots, list) and len(annots) >= n_rows
    if not (list_covers_rows or (got_single_annot and n_rows == 1)):
        raise ValueError(
            'unable to load labels from dataframe; found a single annotation file associated with all '
            'rows in dataframe, but loading it did not return a list of annotations for each row.\n'
            f'Single annotation file: {annot_path}\n'
            f'Loading it returned a {type(annots)}.')

    if got_single_annot:
        # source_annot_map iterates over its second argument
        annots = [annots]

    audio_paths = vak_df['audio_path'].values
    audio_annot_map = source_annot_map(audio_paths, annots)
    # return the annotations in the same order as the rows of the dataframe
    return [audio_annot_map[audio_path] for audio_path in audio_paths]
Exemplo n.º 10
0
def predict(
    csv_path,
    checkpoint_path,
    labelmap_path,
    annot_format,
    to_format_kwargs,
    model_config_map,
    window_size,
    num_workers=2,
    spect_key='s',
    timebins_key='t',
    spect_scaler_path=None,
    device=None,
    logger=None,
):
    """make predictions on dataset with trained model specified in config.toml file.
    Function called by command-line interface.

    Parameters
    ----------
    csv_path : str
        path to where dataset was saved as a csv.
    checkpoint_path : str
        path to directory with checkpoint files saved by Torch, to reload model
    labelmap_path : str or pathlib.Path
        path to 'labelmap.json' file.
    annot_format : str
        format of annotations. Any format that can be used with the
        crowsetta library is valid.
    to_format_kwargs : dict
        keyword arguments for crowsetta `to_format` function.
        Defined in .toml config file as a table.
        An example for the notmat annotation format (as a dictionary) is:
        {'min_syl_dur': 10., 'min_silent_dur': 6., 'threshold': 1500}.
    model_config_map : dict
        where each key-value pair is model name : dict of config parameters
    window_size : int
        size of windows taken from spectrograms, in number of time bins,
        shown to neural networks
    num_workers : int
        Number of processes to use for parallel loading of data.
        Argument to torch.DataLoader. Default is 2.
    spect_key : str
        key for accessing spectrogram in files. Default is 's'.
    timebins_key : str
        key for accessing vector of time bins in files. Default is 't'.
    spect_scaler_path : str
        path to a saved SpectScaler object used to normalize spectrograms.
        If spectrograms were normalized and this is not provided, will give
        incorrect results. Default is None, in which case no scaler is loaded.
    device : str
        Device on which to work with model + data.
        Default is None, in which case the value returned by
        ``get_default_device`` is used.

    Other Parameters
    ----------------
    logger : logging.Logger
        instance created by vak.logging.get_logger. Default is None.

    Returns
    -------
    None
        Predictions are converted to annotations by calling
        ``scribe.to_format`` once for each item in the dataset.
    """
    if device is None:
        device = get_default_device()

    # ---------------- load data for prediction ------------------------------------------------------------------------
    if spect_scaler_path:
        log_or_print(f'loading SpectScaler from path: {spect_scaler_path}',
                     logger=logger,
                     level='info')
        spect_standardizer = joblib.load(spect_scaler_path)
    else:
        log_or_print('Not loading SpectScaler, no path was specified',
                     logger=logger,
                     level='info')
        spect_standardizer = None

    transform, target_transform = transforms.get_defaults(
        'predict',
        spect_standardizer,
        window_size=window_size,
        return_padding_mask=False,
    )

    log_or_print(f'loading dataset to predict from csv path: {csv_path}',
                 logger=logger,
                 level='info')
    pred_dataset = UnannotatedDataset.from_csv(
        csv_path=csv_path,
        split='predict',
        window_size=window_size,
        spect_key=spect_key,
        timebins_key=timebins_key,
        transform=transform,
    )

    pred_data = torch.utils.data.DataLoader(
        dataset=pred_dataset,
        shuffle=False,
        batch_size=1,  # hard coding to make this work for now
        num_workers=num_workers)

    # ---------------- set up to convert predictions to annotation files -----------------------------------------------
    log_or_print(
        f'will convert predictions to specified annotation format: {annot_format}',
        logger=logger,
        level='info')
    log_or_print(
        f'will use following settings for converting to annotation format: {to_format_kwargs}',
        logger=logger,
        level='info')
    scribe = crowsetta.Transcriber(annot_format=annot_format)
    log_or_print(f'loading labelmap from path: {labelmap_path}',
                 logger=logger,
                 level='info')
    # wrap in Path so that a plain string path works as well as a Path instance
    with Path(labelmap_path).open('r') as f:
        labelmap = json.load(f)

    dataset_df = pd.read_csv(csv_path)
    timebin_dur = io.dataframe.validate_and_get_timebin_dur(dataset_df)
    log_or_print(f'dataset has timebins with duration: {timebin_dur}',
                 logger=logger,
                 level='info')
    # ---------------- do the actual predicting + converting to annotations --------------------------------------------
    input_shape = pred_dataset.shape
    # if dataset returns spectrogram reshaped into windows,
    # throw out the window dimension; just want to tell network (channels, height, width) shape
    if len(input_shape) == 4:
        input_shape = input_shape[1:]
    log_or_print(
        f'shape of input to networks used for predictions: {input_shape}',
        logger=logger,
        level='info')

    # the message previously contained a literal '/n' instead of a newline
    log_or_print(
        f'instantiating models from model-config map:\n{model_config_map}',
        logger=logger,
        level='info')
    models_map = models.from_model_config_map(model_config_map,
                                              num_classes=len(labelmap),
                                              input_shape=input_shape)
    for model_name, model in models_map.items():
        # ---------------- do the actual predicting --------------------------------------------------------------------
        log_or_print(
            f'loading checkpoint for {model_name} from path: {checkpoint_path}',
            logger=logger,
            level='info')
        model.load(checkpoint_path)
        log_or_print(f'running predict method of {model_name}',
                     logger=logger,
                     level='info')
        pred_dict = model.predict(pred_data=pred_data, device=device)

        # ----------------  converting to annotations ------------------------------------------------------------------
        # note use no transforms
        dataset_for_annot = UnannotatedDataset.from_csv(
            csv_path=csv_path,
            split='predict',
            window_size=window_size,
            spect_key=spect_key,
            timebins_key=timebins_key,
        )

        data_for_annot = torch.utils.data.DataLoader(dataset=dataset_for_annot,
                                                     shuffle=False,
                                                     batch_size=1,
                                                     num_workers=num_workers)

        # use transform "outside" of Dataset so we can get back crop vec
        pad_to_window = transforms.PadToWindow(window_size,
                                               return_padding_mask=True)

        progress_bar = tqdm(data_for_annot)

        log_or_print('converting predictions to annotation files',
                     logger=logger,
                     level='info')
        for ind, batch in enumerate(progress_bar):
            # here we don't care about putting on some device outside cpu
            x, y = batch[0], batch[1]
            if len(x.shape) == 3:  # ("batch", freq_bins, time_bins)
                x = x.cpu().numpy().squeeze()
            # only the padding mask is used below, to crop the prediction
            x_pad, padding_mask = pad_to_window(x)
            # look up the prediction that was made for this item
            y_pred_ind = pred_dict['y'].index(y)
            y_pred = pred_dict['y_pred'][y_pred_ind]
            y_pred = torch.argmax(y_pred,
                                  dim=1)  # assumes class dimension is 1
            y_pred = torch.flatten(y_pred).cpu().numpy()[padding_mask]
            labels, onsets_s, offsets_s = labelfuncs.lbl_tb2segments(
                y_pred, labelmap=labelmap, timebin_dur=timebin_dur)
            # DataLoader wraps strings in a tuple, need to unpack
            if isinstance(y, tuple) and len(y) == 1:
                y = y[0]
            audio_fname = files.spect.find_audio_fname(y)
            audio_filename = Path(y).parent.joinpath(audio_fname)
            # in case function doesn't accept Path
            audio_filename = str(audio_filename)
            scribe.to_format(labels=labels,
                             onsets_s=onsets_s,
                             offsets_s=offsets_s,
                             filename=audio_filename,
                             **to_format_kwargs)
Exemplo n.º 11
0
# Module-level fixtures shared by the tests in this file.
# (a stray trailing 'f' on the next line used to make the module a syntax error)
HERE = os.path.dirname(__file__)
TEST_DATA_DIR = os.path.join(HERE,
                             '..',
                             '..',
                             'test_data')
SETUP_SCRIPTS_DIR = os.path.join(HERE,
                                 '..',
                                 '..',
                                 'setup_scripts')

NUM_SAMPLES = 10  # number of times to sample behavior of random-number generator

# ---- cbin audio files with 'notmat' annotations ----
audio_dir_cbin = os.path.join(TEST_DATA_DIR, 'cbins', 'gy6or6', '032312')
audio_files_cbin = glob(os.path.join(audio_dir_cbin, '*.cbin'))
annot_files_cbin = files_from_dir(annot_dir=audio_dir_cbin, annot_format='notmat')
scribe_cbin = crowsetta.Transcriber(annot_format='notmat')
annot_list_cbin = scribe_cbin.from_file(annot_file=annot_files_cbin)
labelset_cbin = set('iabcdefghjk')
# labels and durations, only for files whose labels are all in the labelset
durs_cbin = []
labels_cbin = []
# NOTE(review): pairing by zip assumes glob and files_from_dir return files
# in the same order -- confirm, or sort both lists before zipping
for audio_file, annot in zip(audio_files_cbin, annot_list_cbin):
    if set(annot.seq.labels).issubset(labelset_cbin):
        labels_cbin.append(annot.seq.labels)
        fs, data = load_cbin(audio_file)
        # number of samples divided by sampling rate
        durs_cbin.append(data.shape[0] / fs)

# ---- .mat spectrogram files with 'yarden' annotations ----
spect_dir_mat = os.path.join(TEST_DATA_DIR, 'mat', 'llb3', 'spect')
spect_files_mat = glob(os.path.join(spect_dir_mat, '*.mat'))
annot_mat = os.path.join(TEST_DATA_DIR, 'mat', 'llb3', 'llb3_annot_subset.mat')
scribe_yarden = crowsetta.Transcriber(annot_format='yarden')
annot_list_mat = scribe_yarden.from_file(annot_mat)
Exemplo n.º 12
0
    def _check_output(self,
                      data_dir,
                      labelset,
                      audio_format,
                      spect_format,
                      annot_format,
                      annot_file,
                      vds_paths,
                      num_expected_paths,
                      splits=None,
                      specd_durs=None):
        """Assert that saved datasets match the files and durations expected.

        Checks that ``vds_paths`` has ``num_expected_paths`` entries. When
        exactly one path is expected, also checks which data files from
        ``data_dir`` ended up in the dataset (all of them if ``labelset`` is
        None, otherwise only those whose annotation labels are a subset of
        ``labelset``). When ``splits`` and ``specd_durs`` are both given,
        checks the total duration of each split.

        Returns
        -------
        True, if no assertion failed.
        """
        self.assertTrue(len(vds_paths) == num_expected_paths)

        def summed_duration(json_path):
            # total duration of all vocalizations in the dataset at json_path
            loaded = Dataset.from_json(json_fname=json_path)
            return sum(voc.duration for voc in loaded.voc_list)

        # files from data_dir that should have gone into the dataset
        if audio_format:
            data_files_from_dir = vak.io.audio.files_from_dir(
                data_dir, audio_format)
        elif spect_format:
            data_files_from_dir = vak.files.files.from_dir(
                data_dir, spect_format)

        if num_expected_paths == 1:
            vds = Dataset.from_json(json_fname=vds_paths[0])
            if audio_format:
                data_files_in_vds = [voc.audio_path for voc in vds.voc_list]
            elif spect_format:
                data_files_in_vds = [voc.spect_path for voc in vds.voc_list]

            if labelset is None:
                self.assertTrue(data_files_from_dir == data_files_in_vds)
            else:
                scribe = crowsetta.Transcriber(voc_format=annot_format)
                if annot_file:
                    annot_list = scribe.to_seq(file=annot_file)
                else:
                    annot_files = vak.annotation.files_from_dir(
                        annot_dir=data_dir, annot_format=annot_format)
                    annot_list = scribe.to_seq(file=annot_files)
                # a file belongs in the dataset exactly when all of its
                # labels are in labelset
                for data_file, annot in zip(data_files_from_dir, annot_list):
                    should_be_in_vds = set(annot.labels).issubset(labelset)
                    is_in_vds = data_file in data_files_in_vds
                    self.assertTrue(is_in_vds == should_be_in_vds)

        # if we split the dataset, make sure the split worked
        if splits and specd_durs:
            for split, specd_dur in zip(splits, specd_durs):
                matches = [p for p in vds_paths if split in p]
                self.assertTrue(len(matches) == 1)
                split_path = matches[0]

                if specd_dur > 0:
                    # a specified duration is a lower bound on the split
                    self.assertTrue(
                        (summed_duration(split_path) >= specd_dur))
                    continue

                if specd_dur != -1:
                    continue

                # -1: this split should hold whatever the other split
                # did not take from the source dataset
                total_dur = summed_duration(split_path)
                source_vds_path = [
                    p for p in vds_paths
                    if 'test' not in p and 'train' not in p
                ][0]
                source_dur = summed_duration(source_vds_path)

                if split == 'train':
                    other_split = 'test'
                elif split == 'test':
                    other_split = 'train'
                else:
                    continue

                other_path = [p for p in vds_paths if other_split in p][0]
                other_dur = summed_duration(other_path)
                self.assertTrue(isclose(total_dur, source_dur - other_dur))

        return True
Exemplo n.º 13
0
def annot_list_koumura(annot_file_koumura):
    """Return the annotations loaded from ``annot_file_koumura`` by a "koumura"-format Transcriber."""
    return crowsetta.Transcriber(format="koumura").from_file(annot_file_koumura)
Exemplo n.º 14
0
def annot_list_notmat(annot_files_notmat):
    """Return the annotations loaded from ``annot_files_notmat`` by a "notmat"-format Transcriber."""
    return crowsetta.Transcriber(format="notmat").from_file(annot_files_notmat)
Exemplo n.º 15
0
def annot_list_yarden(annot_file_yarden):
    """Return the annotations loaded from ``annot_file_yarden`` by a "yarden"-format Transcriber."""
    return crowsetta.Transcriber(format="yarden").from_file(annot_file_yarden)