Example #1
def get_waveform_transforms(transforms: DictConfig, device=None):
    """
    get all necessary transforms from config
    :param transforms: transforms from config
    :param device: device if transforms need to be ported to device
    :return: transforms composed into aud.Compose
    """
    if transforms is None:
        return None
    if device is not None:
        return aud.Compose([
            hydra.utils.instantiate(transform).to(device)
            for transform in transforms
        ])
    else:
        return aud.Compose(
            [hydra.utils.instantiate(transform) for transform in transforms])
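A minimal usage sketch (not from the source): the config is assumed to be a list of Hydra-instantiable entries, each with a `_target_` class path, and `aud` is assumed to be a transform library whose transforms support `.to(device)` (e.g. torch-audiomentations). The class paths below are assumptions.

from omegaconf import OmegaConf

# hypothetical config; _target_ class paths are assumptions
transforms_cfg = OmegaConf.create([
    {"_target_": "torch_audiomentations.Gain", "p": 0.5},
    {"_target_": "torch_audiomentations.PolarityInversion", "p": 0.5},
])
waveform_transforms = get_waveform_transforms(transforms_cfg, device="cuda:0")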
Example #2
def get_transforms(transforms: DictConfig):
    """
    get all necessary transforms from config
    :param transforms: transforms from config
    :return: transforms composed into aud.Compose
    """
    if transforms is None:
        return None
    return aud.Compose(
        [hydra.utils.instantiate(transform) for transform in transforms])
Example #3
def get_transform(cfg):
    def get_object(trans):
        params = trans.params if trans.params is not None else {}

        if trans.name in {"Compose", "OneOf"}:
            augs_tmp = [get_object(aug) for aug in trans.member]
            return getattr(kvt.augmentation, trans.name)(augs_tmp, **params)

        if hasattr(audi, trans.name):
            return getattr(audi, trans.name)(**params)
        elif hasattr(kvt.augmentation, trans.name):
            return getattr(kvt.augmentation, trans.name)(**params)
        else:
            return eval(trans.name)(**params)

    augs = [get_object(t) for t in cfg]

    return audi.Compose(augs)
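A hypothetical config sketch for get_transform, inferred from the attribute accesses above: each node carries name, params and, for Compose/OneOf nodes, member. It assumes audi is audiomentations and that kvt.augmentation provides a OneOf wrapper.

from omegaconf import OmegaConf

cfg = OmegaConf.create([
    {"name": "AddGaussianNoise", "params": {"p": 0.5}, "member": None},
    {"name": "OneOf",
     "params": {"p": 0.5},
     "member": [
         {"name": "PitchShift",
          "params": {"min_semitones": -4, "max_semitones": 4, "p": 1.0},
          "member": None},
         {"name": "TimeStretch",
          "params": {"min_rate": 0.8, "max_rate": 1.25, "p": 1.0},
          "member": None},
     ]},
])
transform = get_transform(cfg)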
Example #4
    def fasfasa():
        import audiomentations
        from pathlib import Path

        from src import core

        crop_size = 98303  # 768 * 128 - 1
        transforms = audiomentations.Compose([
            core.transforms.ToMono(),
            core.transforms.Squeeze(),
            core.transforms.ToNumpy(),
            audiomentations.TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
            audiomentations.PitchShift(min_semitones=-4,
                                       max_semitones=4,
                                       p=0.5),
            core.transforms.ToTorch()
        ])
        train_dataset = DSD100(root=Path.home() / 'Data' / 'Audio' / 'DSD100',
                               crop_size=crop_size,
                               transforms=transforms)

        for i, e in zip(range(10), train_dataset):
            print(e)
Example #5
            "target" : label,
            "id" : record['recording_id']
        }
###############################
# Augmentations
###############################
import audiomentations as AA

train_audio_transform = AA.Compose([
    AA.AddGaussianNoise(p=0.5),
    AA.AddGaussianSNR(p=0.5),
    #AA.AddBackgroundNoise("../input/train_audio/", p=1)
    #AA.AddImpulseResponse(p=0.1),
    #AA.AddShortNoises("../input/train_audio/", p=1)
    #AA.FrequencyMask(min_frequency_band=0.0,  max_frequency_band=0.2, p=0.1),
    #AA.TimeMask(min_band_part=0.0, max_band_part=0.2, p=0.1),
    #AA.PitchShift(min_semitones=-0.5, max_semitones=0.5, p=0.1),
    #AA.Shift(p=0.1),
    #AA.Normalize(p=0.1),
    #AA.ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=1, p=0.05),
    #AA.PolarityInversion(p=0.05),
    #AA.Gain(p=0.2)
])
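A minimal application sketch (not from the source; the sample rate value is an assumption): an audiomentations Compose is called on a mono float32 waveform together with its sample rate.

import numpy as np

waveform = np.random.uniform(-0.5, 0.5, size=48000 * 10).astype(np.float32)
augmented = train_audio_transform(samples=waveform, sample_rate=48000)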


###############################
# Utils
###############################
def _lwlrap_sklearn(truth, scores):
    """Reference implementation from https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8"""
    sample_weight = np.sum(truth > 0, axis=1)
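The snippet is truncated here; for reference, the sklearn-based lwlrap from the linked notebook typically continues along these lines (a sketch, not necessarily this file's exact code):

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

def _lwlrap_sklearn_sketch(truth, scores):
    # weight each sample by its number of positive labels and drop all-negative rows
    sample_weight = np.sum(truth > 0, axis=1)
    nonzero = np.flatnonzero(sample_weight > 0)
    return label_ranking_average_precision_score(
        truth[nonzero, :] > 0,
        scores[nonzero, :],
        sample_weight=sample_weight[nonzero])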
Example #6
for fold, (t_idx, v_idx) in enumerate(kfold.split(X, y)):
    train_gby.loc[v_idx, "kfold"] = fold

train_df = train_df.merge(train_gby[["recording_id", "kfold"]],
                          on="recording_id",
                          how="left")
print(train_df.kfold.value_counts())
train_df.to_csv(OUTPUT_DIR / "folds.csv", index=False)
species_fmin_fmax.to_csv(OUTPUT_DIR / "species_fmin_fmax.csv", index=False)

################################################
# audiomentations #
################################################
augmenter = A.Compose([
    A.AddGaussianNoise(min_amplitude=0.01, max_amplitude=0.03, p=0.2),
    A.PitchShift(min_semitones=-3, max_semitones=3, p=0.2),
    A.Gain(p=0.2)
])

################################################
# Dataset #
################################################


def cut_spect(spect: torch.Tensor, fmin_mel: int, fmax_mel: int):
    return spect[fmin_mel:fmax_mel]


def do_normalize(img: torch.Tensor):
    bs, ch, w, h = img.shape
    _img = img.clone()
Example #7
    def __init__(self,
                 dir,
                 extract_chunks=True,
                 sample_rate=16000,
                 num_fbanks=40,
                 label_delay=100,
                 no_augment=False,
                 **kwargs):
        self.extract_chunks = extract_chunks
        self.min_length = kwargs["min_chunk_length"]
        self.max_length = kwargs["max_chunk_length"]
        self.sample_rate = sample_rate
        self.num_fbanks = num_fbanks
        self.label_delay = label_delay
        reco2wav = {}  # recording id -> wav path (from wav.scp)
        reco2trs = {}  # recording id -> transcription path (from reco2trs.scp)
        with open(f"{dir}/wav.scp") as f:
            for l in f:
                ss = l.split()
                reco2wav[ss[0]] = ss[1]

        with open(f"{dir}/reco2trs.scp") as f:
            for l in f:
                ss = l.split()
                reco2trs[ss[0]] = ss[1]

        self.sections = []
        # Sections differ in length; each one is indexed
        # section_length / avg_chunk_len + 1 times so that longer
        # sections are sampled proportionally more often.
        self.index2section = []

        avg_chunk_len = self.max_length - self.min_length

        for reco in tqdm.tqdm(
                reco2trs.keys(),
                desc=f"Loading transcriptions and audios for {dir}"):
            try:
                transcription = trs.Transcritpion(reco2wav[reco],
                                                  reco2trs[reco])
                for section in transcription.get_speech_sections():
                    self.sections.append(section)
                    section_length = section.wav_tensor.shape[0] / sample_rate
                    if extract_chunks:
                        self.index2section.extend(
                            [len(self.sections) - 1] *
                            int(section_length // avg_chunk_len + 1))
                    else:
                        self.index2section.append(len(self.sections) - 1)
            except Exception:
                logging.warning(f"Cannot load transcription/audio for {reco}",
                                exc_info=True)

        self.augment = None
        if not no_augment:
            augmentations = []
            if kwargs["rir_dir"] != "":
                augmentations.append(
                    audiomentations.AddImpulseResponse(
                        ir_path=kwargs["rir_dir"], p=0.3, lru_cache_size=1024))
            if kwargs["noise_dir"] != "":
                augmentations.append(
                    audiomentations.AddBackgroundNoise(
                        sounds_path=kwargs["noise_dir"],
                        p=0.3,
                        lru_cache_size=1024))
            if kwargs["short_noise_dir"] != "":
                augmentations.append(
                    audiomentations.AddShortNoises(
                        sounds_path=kwargs["short_noise_dir"],
                        p=0.3,
                        lru_cache_size=1024))
            if len(augmentations) > 0:
                self.augment = audiomentations.Compose(augmentations)
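A construction sketch (the class name and paths are hypothetical; the keyword arguments follow from the kwargs lookups above): dir must contain wav.scp and reco2trs.scp, and an empty string disables the corresponding augmentation source.

dataset = DiarizationChunkDataset(   # hypothetical class name
    dir="data/train",                # expects wav.scp and reco2trs.scp inside
    extract_chunks=True,
    sample_rate=16000,
    num_fbanks=40,
    label_delay=100,
    no_augment=False,
    min_chunk_length=2.0,
    max_chunk_length=4.0,
    rir_dir="data/rirs",
    noise_dir="data/noises",
    short_noise_dir="",              # empty string: no short-noise augmentation
)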
Example #8
def get_training_augmentation():
    train_transform = [
        # audi.Normalize(),
    ]

    return audi.Compose(train_transform)
Example #9
def get_test_augmentation():
    """Add paddings to make audio shape divisible by 32"""
    test_transform = [
        # audi.Normalize(),
    ]
    return audi.Compose(test_transform)
import audiomentations
import cv2

from model.mixers import UseMixerWithProb, RandomMixer, SigmoidConcatMixer, AddMixer, SigmoidVerticalConcatMixer
from model.random_resized_crop import RandomResizedCrop, RandomResizedCrop2
from model.transforms import Compose, UseWithProb, SpecAugment, SpectreScale, PreprocessMelImage, GaussNoise, OneOf, \
    PadToSize, RandomCrop, PreprocessSingleChannelMelImage

wave_augmentations = {
    0: None,
    1: audiomentations.Compose([
        audiomentations.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        audiomentations.TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        audiomentations.PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        audiomentations.Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
    ], p=0.96),  # (1 - (1 - 0.5)^4) * 0.96 == 0.9, i.e. about 90% of samples receive at least one augmentation
    2: audiomentations.Compose([
        audiomentations.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.010, p=0.95),
        audiomentations.Shift(min_fraction=-0.1, max_fraction=0.1, p=0.3),
    ], p=1),
}
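A quick check of the probability estimate in the comment for pipeline 1 (arithmetic only; treating the four p=0.5 transforms as independent is the assumption behind that estimate):

p_any_inner = 1 - (1 - 0.5) ** 4   # chance that at least one inner transform fires: 0.9375
p_augmented = p_any_inner * 0.96   # scaled by the outer Compose probability
print(p_augmented)                 # 0.9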

size_4_sec_750_hop = 256

_base_mel_post_process = {
    'none': [],
    '3ch_1': [
        # Uses librosa.feature.delta with order 1 and 2 to create two extra channels, then divides by 100
        PreprocessMelImage(),
    ],
    '1ch_1': [PreprocessSingleChannelMelImage(), ],