Exemplo n.º 1
0
def test_BufferedStreamer(dimension, batch_size, buf_size):
    """Compare a buffered stream against the raw ``T.md_generator`` output.

    The ``'X'`` field of every batch produced directly by
    ``T.md_generator`` is collected as the reference.  The same generator
    is then wrapped in a ``pescador.Streamer`` and a
    ``pescador.BufferedStreamer``; the ``'X'`` entries of the buffered
    batches are flattened item by item and handed to ``T.__eq_lists``
    together with the reference.

    Finally, a second ``BufferedStreamer`` over the same streamer is
    iterated with ``max_iter=2`` and must yield at most two batches.
    """
    field = 'X'

    # Reference: the raw generator output, one entry per generated batch.
    expected = [batch[field]
                for batch in T.md_generator(dimension, 50, size=batch_size)]

    source = pescador.Streamer(T.md_generator,
                               dimension,
                               50,
                               size=batch_size)

    # Flatten the buffered batches back into individual items.
    buffered = pescador.BufferedStreamer(source, buf_size)
    flattened = [row for batch in buffered for row in batch[field]]

    T.__eq_lists(expected, flattened)

    # A fresh buffer over the same streamer must honour max_iter.
    rebuffered = pescador.BufferedStreamer(source, buf_size)
    limited = list(rebuffered.iterate(max_iter=2))

    assert len(limited) <= 2
Exemplo n.º 2
0
def data_generator(working,
                   tracks,
                   sampler,
                   k,
                   threshold,
                   augment=True,
                   batch_size=32,
                   **kwargs):
    '''Build a multiplexed (and optionally buffered) stream over tracks.

    One ``pescador.Streamer`` wrapping ``data_sampler`` is created for the
    file ``<working>/<track>.h5`` of every track.  When ``augment`` is
    true, every file matching ``<working>/<track>.*.h5`` (sorted) gets a
    streamer as well.  All streamers are combined in a ``pescador.Mux``
    with ``k`` as its second argument; ``kwargs`` are forwarded to the mux.

    Returns the mux itself when ``batch_size == 1``, otherwise a
    ``pescador.BufferedStreamer`` of size ``batch_size`` around it.
    '''

    def _streamer_for(path):
        # Every file is sampled with the same sampler and threshold.
        return pescador.Streamer(data_sampler, path, sampler, threshold)

    streamers = []

    for track in tqdm(tracks):
        base_name = os.path.extsep.join([str(track), 'h5'])
        streamers.append(_streamer_for(os.path.join(working, base_name)))

        if not augment:
            continue

        pattern = os.path.join(working, '{}.*.h5'.format(track))
        streamers.extend(_streamer_for(path) for path in sorted(glob(pattern)))

    # Send it all to a mux
    mux = pescador.Mux(streamers, k, **kwargs)

    if batch_size == 1:
        return mux
    return pescador.BufferedStreamer(mux, batch_size)
Exemplo n.º 3
0
def data_generator(working, tracks, sampler, k, batch_size=32,
                   augmentation=False, weights=None, **kwargs):
    '''Build a (optionally weighted) multiplexed stream over tracks.

    For every track a ``pescador.Streamer`` wrapping ``data_sampler`` is
    created for ``<working>/pump/<track>.npz``.  With ``augmentation``
    enabled, every file matching ``<working>/pump/<track>.*.npz`` (sorted)
    gets a streamer too.  If ``weights`` is given, ``weights.loc[track]``
    is recorded once per streamer, so augmented files share the weight of
    their track; otherwise ``pool_weights=None`` is passed to the mux.

    Remaining ``kwargs`` are forwarded to ``pescador.Mux``.  Returns the
    mux when ``batch_size == 1``, else a ``pescador.BufferedStreamer`` of
    size ``batch_size`` around it.
    '''

    pump_dir = os.path.join(working, 'pump')
    weighted = weights is not None

    streamers = []
    weight_list = []

    for track in tracks:
        # Original file first, then (optionally) its sorted augmentations.
        paths = [os.path.join(pump_dir, os.path.extsep.join([track, 'npz']))]
        if augmentation:
            pattern = os.path.join(pump_dir, '{}.*.npz'.format(track))
            paths.extend(sorted(glob(pattern)))

        for path in paths:
            streamers.append(pescador.Streamer(data_sampler, path, sampler))
            if weighted:
                weight_list.append(weights.loc[track])

    # Send it all to a mux; an empty weight list means "no weighting".
    mux = pescador.Mux(streamers, k, pool_weights=weight_list or None,
                       **kwargs)

    if batch_size == 1:
        return mux
    return pescador.BufferedStreamer(mux, batch_size)
Exemplo n.º 4
0
def data_generator(working,
                   tracks,
                   sampler,
                   k,
                   augment=True,
                   augment_drc=True,
                   batch_size=32,
                   **kwargs):
    '''Build a multiplexed (and optionally buffered) stream over tracks.

    One ``pescador.Streamer`` wrapping ``data_sampler`` is created for
    ``<working>/<track>.h5`` of every track.  When ``augment`` is true,
    streamers for ``<working>/<track>.<n>.h5`` with ``n`` in ``0..3`` are
    added as well (the files are not checked for existence here).

    Parameters
    ----------
    working : str
        Directory containing the ``.h5`` files.
    tracks : iterable
        Track identifiers; each is converted with ``str()``.
    sampler
        Passed through to ``data_sampler`` via each streamer.
    k
        Second positional argument of ``pescador.Mux``.
    augment : bool
        Include the four numbered augmentation files per track.
    augment_drc : bool
        Accepted for signature compatibility; this function does not
        read it.
    batch_size : int
        ``1`` returns the bare mux; any other value wraps it in a
        ``pescador.BufferedStreamer`` of that size.
    **kwargs
        Forwarded to ``pescador.Mux``.
    '''

    def _h5_path(*name_parts):
        # Build the file name from its components instead of patching the
        # joined path with str.replace('.h5', ...): replace() rewrites
        # every occurrence, so a '.h5' inside the working directory or
        # the track id would corrupt the augmented path.
        return os.path.join(working,
                            os.path.extsep.join(list(name_parts) + ['h5']))

    seeds = []

    for track in tqdm(tracks):
        stem = str(track)
        seeds.append(pescador.Streamer(data_sampler, _h5_path(stem), sampler))

        if augment:
            for aug in range(4):
                augname = _h5_path(stem, '{:d}'.format(aug))
                seeds.append(pescador.Streamer(data_sampler, augname, sampler))

    # Send it all to a mux
    mux = pescador.Mux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    else:
        return pescador.BufferedStreamer(mux, batch_size)
Exemplo n.º 5
0
def test_BufferedStreamer_tuples(dimension, batch_size, buf_size, items):
    """Check ``BufferedStreamer.tuples`` against plain iteration.

    A ``BufferedStreamer`` over ``T.md_generator`` is consumed twice:
    first through ``tuples(*items)``, then through direct iteration.
    Each tuple must be a ``tuple`` with one entry per requested item, and
    every entry must be ``np.allclose`` to the corresponding field of the
    batch obtained by direct iteration.
    """
    source = pescador.Streamer(T.md_generator,
                               dimension,
                               50,
                               size=batch_size,
                               items=items)
    buffered = pescador.BufferedStreamer(source, buf_size)

    # Tuples are collected first, then the dictionary-style batches.
    tuple_batches = list(buffered.tuples(*items))
    dict_batches = list(buffered)

    for batch, as_tuple in zip(dict_batches, tuple_batches):
        assert isinstance(as_tuple, tuple)
        assert len(as_tuple) == len(items)
        for name, value in zip(items, as_tuple):
            assert np.allclose(batch[name], value)
Exemplo n.º 6
0
def keras_generator(data_list, input_patch_size):
    """Yield ``('X', 'Y')`` batch tuples built from input/output file pairs.

    ``data_list`` is iterated as ``(fpath_in, fpath_out)`` pairs; each pair
    becomes a ``pescador.Streamer`` around ``patch_generator`` with the
    given ``input_patch_size``.  The streamers are multiplexed (10 as the
    mux's second argument, with replacement, ``lam=500``, seeded with
    ``RANDOM_STATE``), buffered into batches of 16, and the buffered
    stream's ``tuples('X', 'Y')`` output is yielded one batch at a time.
    """
    streams = [
        pescador.Streamer(patch_generator,
                          src_path,
                          dst_path,
                          input_patch_size=input_patch_size)
        for src_path, dst_path in data_list
    ]

    mux_options = dict(with_replacement=True,
                       lam=500,
                       random_state=RANDOM_STATE)
    stream_mux = pescador.Mux(streams, 10, **mux_options)

    batches = pescador.BufferedStreamer(stream_mux, 16)

    for xy_pair in batches.tuples('X', 'Y'):
        yield xy_pair
Exemplo n.º 7
0
def data_generator_balanced(working,
                            tracks,
                            sampler,
                            k,
                            augment=True,
                            augment_drc=True,
                            batch_size=32,
                            **kwargs):
    '''Build a class-balanced stream: one mux per class, then a mux of muxes.

    The weak-label ground-truth CSV (hard-coded path below) is read, and
    for every entry of ``DCASE_CLASSES`` the tracks carrying that label
    are selected from ``tracks``.  Each selected track contributes a
    ``pescador.Streamer`` for ``<working>/<track>.h5`` plus, optionally,
    its numbered augmentation files.  The per-class streamers go into a
    ``pescador.Mux`` that receives ``k // len(DCASE_CLASSES)`` as its
    second argument, and all class muxes are combined by a final mux.

    Parameters
    ----------
    working : str
        Directory containing the ``.h5`` files.
    tracks : iterable
        Track identifiers; compared as strings against the label file's
        file names (prefixed with ``'Y'``, ``'.wav'`` removed).
    sampler
        Passed through to ``data_sampler`` via each streamer.
    k : int
        Budget divided evenly (integer division) between the class muxes.
    augment : bool
        Add streamers for ``<track>.0.h5`` .. ``<track>.9.h5``.
    augment_drc : bool
        Add streamers for ``<track>.10.h5`` .. ``<track>.13.h5``.
    batch_size : int
        ``1`` returns the bare top-level mux; any other value wraps it in
        a ``pescador.BufferedStreamer`` of that size.
    **kwargs
        Forwarded to every per-class ``pescador.Mux`` (not to the
        top-level one).
    '''

    labelfile = (
        '/beegfs/js7561/datasets/dcase2017/task4_official/combined/metadata/'
        'labels/groundtruth_weak_label_training_set.csv')
    labels = pd.read_csv(labelfile, header=None, sep='\t')
    labels.columns = ['filename', 'start_time', 'end_time', 'label']

    # Stringify the tracks once.  Doing this inside the class loop repeats
    # identical work for every class and would leave all classes after
    # the first empty if ``tracks`` were a one-shot iterator.
    tracks_str = np.array([str(t) for t in tracks])

    # Augmentation indices requested by the two flags, in file order.
    aug_ids = []
    if augment:
        aug_ids.extend(range(10))
    if augment_drc:
        aug_ids.extend(range(10, 14))

    def _h5_path(*name_parts):
        # Build the file name from its components instead of patching the
        # joined path with str.replace('.h5', ...): replace() rewrites
        # every occurrence, so a '.h5' inside the working directory or
        # the track id would corrupt the augmented path.
        return os.path.join(working,
                            os.path.extsep.join(list(name_parts) + ['h5']))

    muxes = []

    for label in DCASE_CLASSES:
        lclass = labels[labels.label == label]
        filenames = [('Y{}'.format(fn)).replace('.wav', '')
                     for fn in lclass.filename.values]
        filenames = np.intersect1d(tracks_str, np.array(filenames))
        print(label, len(filenames))

        seeds = []

        for track in tqdm(filenames):
            stem = str(track)
            seeds.append(
                pescador.Streamer(data_sampler, _h5_path(stem), sampler))

            for aug in aug_ids:
                augname = _h5_path(stem, '{:d}'.format(aug))
                seeds.append(
                    pescador.Streamer(data_sampler, augname, sampler))

        # Send it all to a mux; every class gets an equal share of k.
        n_active = k // len(DCASE_CLASSES)
        mux = pescador.Mux(seeds, n_active, **kwargs)
        # Add mux to list
        muxes.append(mux)

    # Create mux from muxes
    supermux = pescador.Mux(muxes,
                            len(muxes),
                            lam=None,
                            pool_weights=None,
                            with_replacement=True)

    if batch_size == 1:
        return supermux
    else:
        return pescador.BufferedStreamer(supermux, batch_size)
Exemplo n.º 8
0
def multiplex_lms_with_background(aug_kind_str, fold_units, n_input_hops,
                                  batch_size):
    """Multiplex log-mel-spectrogram/background streamers into Keras tuples.

    One ``pescador.Streamer`` around ``yield_lms_and_background`` is built
    for every (augmentation instance, unit) pair, all streamers are
    multiplexed with every one of them active, the mux is buffered into
    batches of ``batch_size``, and the result is passed through
    ``pescador.maps.keras_tuples`` with inputs ``["X_spec", "X_bg"]`` and
    output ``["y"]``.

    Parameters
    ----------
    aug_kind_str : str
        One of ``"none"``, ``"pitch"``, ``"stretch"``, ``"all-but-noise"``,
        ``"noise"`` or ``"all"``.
    fold_units : sequence of str
        Unit names; also used to name the ``"noise-<unit>"`` augmentations.
    n_input_hops
        Forwarded to ``yield_lms_and_background``.
    batch_size : int
        Buffer size of the ``pescador.BufferedStreamer``.

    Raises
    ------
    ValueError
        If ``aug_kind_str`` is not one of the recognized kinds.
        (Previously this surfaced later as an ``UnboundLocalError``.)
    """

    # Parse augmentation kind string (aug_kind_str) first, so that an
    # invalid value fails fast with a clear message.
    if aug_kind_str == "none":
        augs = ["original"]
    elif aug_kind_str == "pitch":
        augs = ["original", "pitch"]
    elif aug_kind_str == "stretch":
        augs = ["original", "stretch"]
    elif aug_kind_str == "all-but-noise":
        augs = ["original", "pitch", "stretch"]
    elif aug_kind_str in ("all", "noise"):
        noise_augs = ["noise-" + unit_str for unit_str in fold_units]
        if aug_kind_str == "all":
            augs = noise_augs + ["original", "pitch", "stretch"]
        else:
            augs = noise_augs + ["original"]
    else:
        raise ValueError(
            "Unrecognized aug_kind_str: {!r}".format(aug_kind_str))

    # Define constants.
    aug_dict = localmodule.get_augmentations()
    data_dir = localmodule.get_data_dir()
    dataset_name = localmodule.get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-logmelspec"])
    tfr_dir = os.path.join(data_dir, tfr_name)
    bg_dir_name = "_".join([dataset_name, "clip-logmelspec-backgrounds"])
    bg_dir = os.path.join(data_dir, bg_dir_name)
    # NOTE(review): bg_duration is not defined in this function; it is
    # presumably a module-level global -- confirm.
    T_str = "T-" + str(bg_duration).zfill(4)
    T_dir = os.path.join(bg_dir, T_str)

    # Loop over augmentations.
    streams = []
    for aug_str in augs:

        # Define instances.
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = [
                "-".join([aug_str, str(instance_id)])
                for instance_id in range(n_instances)
            ]

        # Define bias: noise augmentations get -17, everything else 0.
        if aug_str[:5] == "noise":
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)

        # Loop over instances.
        for instanced_aug_str in instances:

            # Loop over units.
            for unit_str in fold_units:

                # Define path to time-frequency representation.
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")

                # Define path to background (depends on the unit only).
                bg_file = "_".join([
                    dataset_name, "background_summaries", unit_str,
                    T_str + ".hdf5"
                ])
                bg_path = os.path.join(T_dir, bg_file)

                # Define pescador streamer.
                stream = pescador.Streamer(yield_lms_and_background, lms_path,
                                           n_input_hops, bias, bg_path)
                streams.append(stream)

    # Multiplex streamers together.
    mux = pescador.Mux(streams,
                       k=len(streams),
                       lam=None,
                       with_replacement=True,
                       revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)

    return pescador.maps.keras_tuples(buffered_streamer,
                                      inputs=["X_spec", "X_bg"],
                                      outputs=["y"])
Exemplo n.º 9
0
def multiplex_tfr(aug_kind_str,
                  fold_units,
                  n_hops,
                  batch_size,
                  tfr_str="logmelspec"):
    """Multiplex time-frequency-representation streamers into (X, y) tuples.

    One ``pescador.Streamer`` around ``yield_tfr`` is built for every HDF5
    container, i.e. every (augmentation instance, unit) pair.  All
    streamers are multiplexed with every one of them active, the mux is
    buffered into batches of ``batch_size``, and the buffered streamer's
    ``tuples("X", "y", cycle=True)`` is returned.

    Parameters
    ----------
    aug_kind_str : str
        One of ``"none"``, ``"pitch"``, ``"stretch"``, ``"all-but-noise"``,
        ``"noise"`` or ``"all"``.
    fold_units : sequence of str
        Unit names; also used to name the ``"noise-<unit>"`` augmentations.
    n_hops
        Forwarded to ``yield_tfr``.
    batch_size : int
        Buffer size of the ``pescador.BufferedStreamer``.
    tfr_str : str
        Name of the representation; selects the ``clip-<tfr_str>``
        directory and is forwarded to ``yield_tfr``.

    Raises
    ------
    ValueError
        If ``aug_kind_str`` is not one of the recognized kinds.
        (Previously this surfaced later as an ``UnboundLocalError``.)
    """
    # Parse augmentation kind string (aug_kind_str).
    if aug_kind_str == "none":
        augs = ["original"]
    elif aug_kind_str == "pitch":
        augs = ["original", "pitch"]
    elif aug_kind_str == "stretch":
        augs = ["original", "stretch"]
    elif aug_kind_str == "all-but-noise":
        augs = ["original", "pitch", "stretch"]
    elif aug_kind_str in ("all", "noise"):
        noise_augs = ["noise-" + unit_str for unit_str in fold_units]
        if aug_kind_str == "all":
            augs = noise_augs + ["original", "pitch", "stretch"]
        else:
            augs = noise_augs + ["original"]
    else:
        raise ValueError(
            "Unrecognized aug_kind_str: {!r}".format(aug_kind_str))

    # Generate a Pescador streamer for every HDF5 container, that is,
    # every unit-augmentation-instance triplet.
    aug_dict = get_augmentations()
    data_dir = get_data_dir()
    dataset_name = get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-" + tfr_str])
    tfr_dir = os.path.join(data_dir, tfr_name)
    streams = []
    for aug_str in augs:
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = [
                "-".join([aug_str, str(instance_id)])
                for instance_id in range(n_instances)
            ]
        # Only noise augmentations of the log-mel-spectrogram get the
        # -17 bias; every other combination uses 0.
        if aug_str[:5] == "noise" and tfr_str == "logmelspec":
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)
        for instanced_aug_str in instances:
            for unit_str in fold_units:
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")
                stream = pescador.Streamer(yield_tfr, lms_path, n_hops, bias,
                                           tfr_str)
                streams.append(stream)

    # Multiplex streamers together.
    mux = pescador.Mux(streams,
                       k=len(streams),
                       lam=None,
                       with_replacement=True,
                       revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)

    return buffered_streamer.tuples("X", "y", cycle=True)