Example #1
    def get_dataloader(self):
        vx = VOXFORGE(args.data_path,
                      langs=args.languages,
                      label_type="lang",
                      use_cache=args.use_cache,
                      use_precompute=args.use_precompute)
        if self.model_name == "resnet34_conv" or self.model_name == "resnet101_conv":
            T = tat.Compose([
                #tat.PadTrim(self.max_len),
                tat.MEL(n_mels=224),
                tat.BLC2CBL(),
                tvt.ToPILImage(),
                tvt.Resize((224, 224)),
                tvt.ToTensor(),
            ])
            TT = spl_transforms.LENC(vx.LABELS)
        elif self.model_name == "resnet34_mfcc":
            sr = 16000
            ws = 800
            hs = ws // 2
            n_fft = 512  # 256
            n_filterbanks = 26
            n_coefficients = 12
            low_mel_freq = 0
            high_freq_mel = (2595 * math.log10(1 + (sr / 2) / 700))
            mel_pts = torch.linspace(low_mel_freq, high_freq_mel,
                                     n_filterbanks + 2)  # sr = 16000
            hz_pts = torch.floor(700 * (torch.pow(10, mel_pts / 2595) - 1))
            bins = torch.floor((n_fft + 1) * hz_pts / sr)
            td = {
                "RfftPow": spl_transforms.RfftPow(n_fft),
                "FilterBanks": spl_transforms.FilterBanks(n_filterbanks, bins),
                "MFCC": spl_transforms.MFCC(n_filterbanks, n_coefficients),
            }

            T = tat.Compose([
                tat.Scale(),
                #tat.PadTrim(self.max_len, fill_value=1e-8),
                spl_transforms.Preemphasis(),
                spl_transforms.Sig2Features(ws, hs, td),
                spl_transforms.DummyDim(),
                tat.BLC2CBL(),
                tvt.ToPILImage(),
                tvt.Resize((224, 224)),
                tvt.ToTensor(),
            ])
            TT = spl_transforms.LENC(vx.LABELS)
        vx.transform = T
        vx.target_transform = TT
        if args.use_precompute:
            vx.load_precompute(args.model_name)
        dl = data.DataLoader(vx,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             shuffle=True)
        return vx, dl
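The "_mfcc" branch above builds its own mel filterbank edges: the Nyquist frequency is mapped to the mel scale with 2595 * log10(1 + f / 700), that range is split into n_filterbanks + 2 evenly spaced mel points, and the points are mapped back to Hz and finally to FFT bin indices. Below is a minimal standalone sketch of the same round trip, assuming the sr = 16000 and n_fft = 512 used in the example; the assert and print are illustrative only.

import math
import torch

sr, n_fft = 16000, 512
n_filterbanks = 26

# Hz -> mel for the Nyquist frequency, then evenly spaced points on the mel scale
high_freq_mel = 2595 * math.log10(1 + (sr / 2) / 700)
mel_pts = torch.linspace(0, high_freq_mel, n_filterbanks + 2)

# mel -> Hz, then Hz -> FFT bin index for each filterbank edge
hz_pts = torch.floor(700 * (torch.pow(10, mel_pts / 2595) - 1))
bins = torch.floor((n_fft + 1) * hz_pts / sr)

# edges are non-decreasing and the last one lands near the Nyquist bin (n_fft // 2)
assert bool((bins[1:] >= bins[:-1]).all())
print(bins)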
Example #2
 def test4(self):
     vx = VOXFORGE(self.bdir,
                   download=False,
                   label_type="lang",
                   num_zips=10,
                   randomize=False,
                   dev_mode=False)
     vx.find_max_len()
     T = transforms.Compose([
         transforms.PadTrim(vx.maxlen),
     ])
     TT = spl_transforms.LENC(vx.LABELS)
     vx.transform = T
     vx.target_transform = TT
     print(vx.splits)
     dl = data.DataLoader(vx, batch_size=5)
     total_train = 0
     for i, (mb, l) in enumerate(dl):
         vx.set_split("train")
         total_train += l.size(0)
         if i == 2:
             vx.set_split("valid")
             total_valid = 0
             for mb_valid, l_valid in dl:
                 total_valid += l_valid.size(0)
             print(total_valid)
     print(total_train)
Example #3
    def test1(self):
        # Data
        vx = VOXFORGE(self.bdir, label_type="lang")
        vx.find_max_len()
        print(vx.maxlen)
        T = tat.Compose([
                tat.PadTrim(vx.maxlen),
                spl_transforms.MEL(n_mels=224),
                spl_transforms.BLC2CBL(),
                tvt.ToPILImage(),
                tvt.Scale((224, 224)),
                tvt.ToTensor(),
            ])
        TT = spl_transforms.LENC(vx.LABELS)
        vx.transform = T
        vx.target_transform = TT
        dl = data.DataLoader(vx, batch_size=25, shuffle=True)

        # Model and Loss
        model = models.squeezenet.squeezenet(True)
        model.train()

        for i, (mb, tgts) in enumerate(dl):
            vx.set_split("train")
            out = model(Variable(mb))
            print(mb.size(), mb.min(), mb.max())
            print(out.data.size())
            print(out.data)
            break
Example #4
    def test1(self):
        # Data
        vx = VOXFORGE(self.bdir, label_type="lang", use_cache=True)
        #vx.find_max_len()
        vx.maxlen = 150000
        T = tat.Compose([
                tat.PadTrim(vx.maxlen),
                tat.MEL(n_mels=224),
                tat.BLC2CBL(),
                tvt.ToPILImage(),
                tvt.Scale((224, 224)),
                tvt.ToTensor(),
            ])
        TT = spl_transforms.LENC(vx.LABELS)
        vx.transform = T
        vx.target_transform = TT
        dl = data.DataLoader(vx, batch_size=25, shuffle=True)

        # Model and Loss
        model = models.resnet.resnet34(True)
        print(model)
        criterion = nn.CrossEntropyLoss()
        plist = nn.ParameterList()
        #plist.extend(list(model[0].parameters()))
        plist.extend(list(model[1].fc.parameters()))
        #plist.extend(list(model.parameters()))
        #optimizer = torch.optim.SGD(plist, lr=0.0001, momentum=0.9)
        optimizer = torch.optim.Adam(plist, lr=0.0001)

        train_losses = []
        valid_losses = []
        for i, (mb, tgts) in enumerate(dl):
            model.train()
            vx.set_split("train")
            mb, tgts = Variable(mb), Variable(tgts)
            model.zero_grad()
            out = model(mb)
            loss = criterion(out, tgts)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.data[0])
            print(loss.data[0])
            if i % 5 == 0:
                start = time.time()
                model.eval()
                vx.set_split("valid")
                running_validation_loss = 0
                correct = 0
                for mb_valid, tgts_valid in dl:
                    mb_valid, tgts_valid = Variable(mb_valid), Variable(tgts_valid)
                    out_valid = model(mb_valid)
                    loss_valid = criterion(out_valid, tgts_valid)
                    running_validation_loss += loss_valid.data[0]
                    correct += (out_valid.data.max(1)[1] == tgts_valid.data).sum()
                print_running_time(start)
                valid_losses.append((running_validation_loss, correct / len(vx)))
                print("loss: {}, acc: {}".format(running_validation_loss, correct / len(vx)))
            if i == 11: break
            vx.set_split("train")
Example #5
    def test_compose(self):

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 1.2)
        maxminmax = max(abs(audio_orig.min()), abs(audio_orig.max())).item()

        tset = (transforms.Scale(factor=maxminmax),
                transforms.PadTrim(max_len=length_new, channels_first=False))
        result = transforms.Compose(tset)(audio_orig)

        self.assertTrue(max(abs(result.min()), abs(result.max())) == 1.)

        self.assertTrue(result.size(0) == length_new)

        repr_test = transforms.Compose(tset)
        self.assertTrue(repr_test.__repr__())
Example #6
    def test_compose(self):

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 1.2)
        maxminmax = np.abs(
            [audio_orig.min(), audio_orig.max()]).max().astype(np.float)

        tset = (transforms.Scale(factor=maxminmax),
                transforms.PadTrim(max_len=length_new))
        result = transforms.Compose(tset)(audio_orig)

        self.assertTrue(np.abs([result.min(), result.max()]).max() == 1.)

        self.assertTrue(result.size(0) == length_new)

        repr_test = transforms.Compose(tset)
        repr_test.__repr__()
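Both test_compose variants rely on Compose applying its transforms in order: Scale first normalizes the waveform by its largest absolute value, then PadTrim pads (or cuts) the time dimension to max_len. Below is a plain-torch sketch of that ordering under the older torchaudio semantics assumed by these tests (Scale(factor) divides by factor, PadTrim zero-pads at the end); the helper names are illustrative, not torchaudio API.

import torch

def scale(sig, factor):
    # divide by factor (here the max absolute value) so the peak lands at +/- 1
    return sig / factor

def pad_trim(sig, max_len):
    # zero-pad or cut the time dimension (dim 0) to exactly max_len samples
    if sig.size(0) >= max_len:
        return sig[:max_len]
    pad = torch.zeros(max_len - sig.size(0), *sig.shape[1:], dtype=sig.dtype)
    return torch.cat([sig, pad], dim=0)

audio = torch.randn(1000, 1)
out = pad_trim(scale(audio, audio.abs().max()), int(1000 * 1.2))
assert out.abs().max() == 1.0 and out.size(0) == 1200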
Example #7
def load_dataset(dataset='VCTK', train_subset=1.0, person_filter=None):

    transfs = transforms.Compose([
        transforms.Scale(),
        prepro.DB_Spec(n_fft=400, hop_t=0.010, win_t=0.025)
    ])

    if dataset == 'VCTK':
        person_filter = [
            'p249', 'p239', 'p276', 'p283', 'p243', 'p254', 'p258', 'p271'
        ]
        train_dataset = vctk_custom_dataset.VCTK('../datasets/VCTK-Corpus/',
                                                 preprocessed=True,
                                                 person_filter=person_filter,
                                                 filter_mode='exclude')
        test_dataset = vctk_custom_dataset.VCTK('../datasets/VCTK-Corpus/',
                                                preprocessed=True,
                                                person_filter=person_filter,
                                                filter_mode='include')
    elif dataset == 'LibriSpeech':
        train_dataset = librispeech_custom_dataset.LibriSpeech(
            '../datasets/LibriSpeech/',
            preprocessed=True,
            split='train',
            person_filter=person_filter,
            filter_mode='include')
        test_dataset = librispeech_custom_dataset.LibriSpeech(
            '../datasets/LibriSpeech/',
            preprocessed=True,
            split='test',
            person_filter=person_filter,
            filter_mode='include')

    indices = list(range(len(train_dataset)))
    split = int(np.floor(len(train_dataset) * train_subset))

    train_sampler = sampler.RandomSampler(
        sampler.SubsetRandomSampler(indices[:split]))
    test_sampler = sampler.RandomSampler(test_dataset)

    kwargs = {'num_workers': 8, 'pin_memory': True} if args.use_cuda else {}
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               drop_last=False,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              sampler=test_sampler,
                                              drop_last=False,
                                              **kwargs)

    return train_loader, test_loader, train_dataset, test_dataset
Example #8
def get_loader(config, data_dir):
    root = os.path.join(os.path.abspath(os.curdir), data_dir)
    print('-- Loading audios')
    dataset = AudioFolder(root=root,
                          transform=transforms.Compose([
                              transforms.PadTrim(133623, 0),
                              transforms.LC2CL()
                          ]))
    loader = DataLoader(dataset=dataset,
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=config.num_workers)
    return loader
Example #9
def input_fn(request_body, content_type='application/json'):
    logger.info('Deserializing the input data.')
    if content_type == 'application/json':
        input_data = json.loads(request_body)
        url = input_data['url']
        logger.info(f'Image url: {url}')
        image_data = Image.open(requests.get(url, stream=True).raw)

        image_transform = transforms.Compose([
            transforms.Resize(size=256),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        return image_transform(image_data)
    raise Exception(f'Requested unsupported ContentType in content_type {content_type}')
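input_fn follows the SageMaker-style inference handler contract: it receives the raw request body plus its content type and returns the preprocessed model input. A hedged usage sketch, where the URL is a placeholder and input_fn is assumed to be importable from the module that defines it:

import json

# hypothetical request body, as a JSON endpoint invocation would supply it
request_body = json.dumps({"url": "https://example.com/sample.jpg"})

tensor = input_fn(request_body, content_type="application/json")
print(tensor.shape)  # (3, 224, 224) for an RGB image after Resize, CenterCrop, ToTensor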
Example #10
 def test4(self):
     ds = AUDIOSET(self.bdir)
     T = transforms.Compose([
         transforms.PadTrim(ds.maxlen),
     ])
     # encode multi-label targets the way the other AUDIOSET tests do
     TT = mgc_transforms.BinENC(ds.labels_dict)
     ds.transform = T
     ds.target_transform = TT
     dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
     total_train = 0
     for i, (mb, l) in enumerate(dl):
         total_train += l.size(0)
         if i == 2:
             #ds.set_split("valid")
             total_valid = 0
             for mb_valid, l_valid in dl:
                 total_valid += l_valid.size(0)
             print(total_valid)
     print(total_train)
Example #11
 def test3(self):
     """
         Test that the data loader does transforms
     """
     ds = AUDIOSET(self.bdir, randomize=True)
     T = transforms.Compose([
         transforms.PadTrim(ds.maxlen),
         mgc_transforms.MEL(),
         mgc_transforms.BLC2CBL()
     ])
     TT = mgc_transforms.BinENC(ds.labels_dict)
     ds.transform = T
     ds.target_transform = TT
     dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
     labels_total = 0
     print(ds.labels_dict)
     for i, (a, b) in enumerate(dl):
         print(a.size(), b.size())
         if i > 10: break
Example #12
 def test3(self):
     vx = VOXFORGE(self.bdir,
                   download=False,
                   label_type="lang",
                   num_zips=10,
                   randomize=False,
                   split="valid",
                   dev_mode=False)
     vx.find_max_len()
     T = transforms.Compose([
         transforms.PadTrim(vx.maxlen),
         spl_transforms.MEL(),
         spl_transforms.BLC2CBL()
     ])
     TT = spl_transforms.LENC(vx.LABELS)
     vx.transform = T
     vx.target_transform = TT
     dl = data.DataLoader(vx, batch_size=5)
     labels_total = 0
     for i, (a, b) in enumerate(dl):
         labels_total += b.sum()
     print((len(vx) - labels_total) / len(vx))
Example #13
 def test5(self):
     import numpy as np
     vx = VOXFORGE(self.bdir,
                   download=False,
                   label_type="prompts",
                   dev_mode=False)
     vx.find_max_len()
     T = transforms.Compose([
         transforms.PadTrim(vx.maxlen),
     ])
     TT = spl_transforms.WC()
     vx.transform = T
     vx.target_transform = TT
     print(vx.splits)
     dl = data.DataLoader(vx, batch_size=5, collate_fn=basic_collate)
     max_wc = 0
     wc_all = []
     for i, (mb, tgts) in enumerate(dl):
         tgts = sorted(tgts)  # sorted() returns a new list; it does not sort in place
         max_wc = tgts[-1] if tgts[-1] > max_wc else max_wc
         wc_all.extend(tgts)
     print(np.histogram(wc_all, bins=20), len(wc_all))
Example #14
    def test1(self):
        """
            Test that the data loader does transforms
        """

        NMELS = 224

        ds = AUDIOSET(self.bdir, randomize=True)
        T = transforms.Compose([
            transforms.PadTrim(ds.maxlen),
            mgc_transforms.MEL(n_mels=NMELS),
            mgc_transforms.BLC2CBL()
        ])
        TT = mgc_transforms.BinENC(ds.labels_dict)
        ds.transform = T
        ds.target_transform = TT
        dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
        labels_total = 0
        for i, (a, b) in enumerate(dl):
            print(a.size(), b.size())
            break

        self.assertTrue(a.size()[-2:] == (NMELS, 313))
Example #15
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from loader_audioset import AUDIOSET
import mgc_transforms
import torchaudio.transforms as tat

AUDIOSET_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "data", "audioset")

DATASET = "balanced"

IMG_SIZE = (10, 5)
CMAP_COLOR = "jet"

T = tat.Compose([
    #tat.PadTrim(self.max_len),
    mgc_transforms.MEL(sr=16000, n_fft=800, hop_length=320, n_mels=224),
    mgc_transforms.BLC2CBL(),
    #mgc_transforms.Scale(),
])
ds = AUDIOSET(AUDIOSET_PATH, transform=T, dataset=DATASET, num_samples=1)

rev_labeler = {x["label_id"]: x["name"] for _, x in ds.labels_dict.items()}

for sample, label in ds:
    sample.squeeze_()
    sample = sample.numpy()
    sample = np.log(sample)
    sample -= sample.min()

    plt.figure(figsize=IMG_SIZE)
    plt.title("MEL Spectrogram of {} Audio".format(rev_labeler[label[0]].capitalize()))
    plt.imshow(sample, interpolation='nearest',
Example #16
import torch
import torchaudio.datasets as dset
from torchaudio import transforms

transform = transforms.Compose(
    [transforms.Scale(), transforms.PadTrim(100000)])

train_dataset = dset.YESNO("data", transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=10,
)

for i, (input, target) in enumerate(train_loader):
    import ipdb
    ipdb.set_trace(context=21)
    print("HI")
""" Vision MNIST test"""
"""
import torchvision.datasets as vdset
from torchvision import transforms as vtransforms

transform = vtransforms.Compose([
        vtransforms.ToTensor()
        ])

mnist = vdset.MNIST("data", transform=transform, download=True)

mnist_loader = torch.utils.data.DataLoader(
  mnist,
  batch_size=10,
Example #17
def evaluate():
  num_classes = 4

  # Init logger
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
  print_log('save path : {}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
  print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
  print_log("torch  version : {}".format(torch.__version__), log)
  print_log("cudnn  version : {}".format(torch.backends.cudnn.version()), log)

  # Any other preprocessings? http://pytorch.org/audio/transforms.html
  sample_length = 10000
  scale = transforms.Scale()
  padtrim = transforms.PadTrim(sample_length)
  transforms_audio = transforms.Compose([
    scale, padtrim
  ])


  # Data loading
  fs, data = wavfile.read(args.file_name)
  data = torch.from_numpy(data).float()
  data = data.unsqueeze(1)
  audio = transforms_audio(data)
  audio = Variable(audio)
  audio = audio.view(1, -1)
  audio = audio.unsqueeze(0)


  #Feed in respective model file to pass into model (alexnet.py)
  print_log("=> creating model '{}'".format(args.arch), log)

  # Init model, criterion, and optimizer
  # net = models.__dict__[args.arch](num_classes)
  net = AlexNet(num_classes)
  print_log("=> network :\n {}".format(net), log)


  #Sets use for GPU if available
  if args.use_cuda:
    net.cuda()

  # optionally resume from a checkpoint
  # Need same python version that the resume was in
  if args.resume:
    if os.path.isfile(args.resume):
      print_log("=> loading checkpoint '{}'".format(args.resume), log)
      if args.ngpu == 0:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
      else:
        checkpoint = torch.load(args.resume)

      recorder = checkpoint['recorder']
      args.start_epoch = checkpoint['epoch']
      net.load_state_dict(checkpoint['state_dict'])
      print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
    else:
      print_log("=> no checkpoint found at '{}'".format(args.resume), log)
  else:
    print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

  net.eval()
  if args.use_cuda:
    audio = audio.cuda()
  output = net(audio)
  print(output)
  # TODO postprocess output to a string representing the person speaking
  # output = val_dataset.postprocess_target(output)
  return
Example #18
low_mel_freq = 0
high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
mel_pts = np.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2)
hz_pts = np.floor(700 * (10**(mel_pts / 2595) - 1))
bins = np.floor((n_fft + 1) * hz_pts / sr)

# data transformations
td = {
    "RfftPow": RfftPow(n_fft),
    "FilterBanks": FilterBanks(n_filterbanks, bins),
    "MFCC": MFCC(n_filterbanks, n_coefficients),
}

transforms = tat.Compose([
    tat.Scale(),
    tat.PadTrim(58000, fill_value=1e-8),
    Preemphasis(),
    Sig2Features(ws, hs, td),
])

# set network parameters
use_cuda = torch.cuda.is_available()
batch_size = args.batch_size
input_features = 26
hidden_size = 100
output_size = 3
#output_length = (8 + 7 + 2) # with "blanks"
output_length = 8  # without blanks
n_layers = 1
attn_modus = "dot"

# build networks, criterion, optimizers, dataset and dataloader
Example #19
    seq_M = args.seq_M
    batch_size = args.batch_size
    depth = args.depth
    radixs = [2] * depth
    N = np.prod(radixs)
    channels = args.channels
    lr = args.lr
    steps = args.steps
    c = args.c
    generation_time = args.file_size
    filename = args.outfile

    maxlen = 50000
    print('==> Downloading YesNo Dataset..')
    transform = transforms.Compose(
        [transforms.Scale(),
         transforms.PadTrim(maxlen),
         transforms.MuLawEncoding(quantization_channels=channels)])
    data = torchaudio.datasets.YESNO('./data', download=True, transform=transform)
    data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=True)

    print('==> Building model..')
    net = general_FFTNet(radixs, 128, channels).cuda()

    print(sum(p.numel() for p in net.parameters() if p.requires_grad), "of parameters.")

    optimizer = optim.Adam(net.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    print("Start Training.")
    a = datetime.now().replace(microsecond=0)
Example #20
import argparse
import torch
import torchaudio
from torchaudio import transforms, save
import numpy as np
import ujson
from vctk_custom_dataset import VCTK
import librosa
from audio_utils import griffinlim

import audio_utils as prepro

parser = argparse.ArgumentParser(description='Prepare VCTK Dataset')

parser.add_argument('--vctk-path',
                    type=str,
                    metavar='S',
                    required=True,
                    help='(path to VCTK-Corpus)')

args = parser.parse_args()

transfs = transforms.Compose(
    [prepro.DB_Spec(sr=11025, n_fft=400, hop_t=0.010, win_t=0.025)])

dataset = VCTK(root=args.vctk_path, transform=transfs)
Example #21
def create_data_pipelines(H):
    vocab = Vocabulary(os.path.join(H.ROOT_DIR, H.EXPERIMENT),
                       encoding=H.TARGET_ENCODING)

    augmentation_transform = transforms.Compose([
        AudioNoiseInjection(probability=H.NOISE_BG_PROBABILITY,
                            noise_levels=H.NOISE_BG_LEVELS,
                            noise_dir=H.NOISE_BG_DIR),
        AudioNoiseGeneration(probability=H.AUDIO_NOISE_PROBABILITY,
                             noise_levels=H.AUDIO_NOISE_LEVELS,
                             noise_colors=H.AUDIO_NOISE_COLORS),
        AudioPitchShift(probability=H.AUDIO_PITCH_PROBABILITY,
                        sample_rate=H.AUDIO_SAMPLE_RATE,
                        pitch_pm=H.AUDIO_PITCH_PM),
        AudioTimeStrech(probability=H.AUDIO_SPEED_PROBABILITY,
                        low_high=H.AUDIO_SPEED_LOW_HIGH),
        AudioDynamicRange(probability=H.AUDIO_DYNAMIC_PROBABILITY,
                          low_high=H.AUDIO_DYNAMIC_LOW_HIGH),
        AudioTimeShift(probability=H.AUDIO_SHIFT_PROBABILITY,
                       sample_rate=H.AUDIO_SAMPLE_RATE,
                       min_max=H.AUDIO_SHIFT_MIN_MAX),
    ])

    audio_transform_train = transforms.Compose([
        AudioAugmentation(augmentation_transform,
                          probability=H.AUGMENTATION_PROBABILITY),
        AudioNormalizeDB(db=H.NORMALIZE_DB, max_gain_db=H.NORMALIZE_MAX_GAIN),
        AudioSpectrogram(sample_rate=H.AUDIO_SAMPLE_RATE,
                         window_size=H.SPECT_WINDOW_SIZE,
                         window_stride=H.SPECT_WINDOW_STRIDE,
                         window=H.SPECT_WINDOW),
        AudioNormalize(),
        FromNumpyToTensor(tensor_type=torch.FloatTensor)
    ])

    audio_transform = transforms.Compose([
        AudioNormalizeDB(db=H.NORMALIZE_DB, max_gain_db=H.NORMALIZE_MAX_GAIN),
        AudioSpectrogram(sample_rate=H.AUDIO_SAMPLE_RATE,
                         window_size=H.SPECT_WINDOW_SIZE,
                         window_stride=H.SPECT_WINDOW_STRIDE,
                         window=H.SPECT_WINDOW),
        AudioNormalize(),
        FromNumpyToTensor(tensor_type=torch.FloatTensor)
    ])

    if 'ctc' in H.TARGET_ENCODING:
        label_transform = transforms.Compose([
            TranscriptEncodeCTC(vocab),
            FromNumpyToTensor(tensor_type=torch.LongTensor)
        ])
    elif 'sts' in H.TARGET_ENCODING:
        label_transform = transforms.Compose([
            TranscriptEncodeSTS(vocab),
            FromNumpyToTensor(tensor_type=torch.LongTensor)
        ])
    else:
        raise ValueError('TARGET_ENCODING value not valid.')

    train_dataset = AudioDataset(os.path.join(H.ROOT_DIR, H.EXPERIMENT),
                                 manifests_files=H.MANIFESTS,
                                 datasets=["train", "pseudo"],
                                 transform=audio_transform_train,
                                 label_transform=label_transform,
                                 max_data_size=None,
                                 sorted_by='recording_duration',
                                 min_max_duration=H.MIN_MAX_AUDIO_DURATION,
                                 min_max_length=H.MIN_MAX_TRANSCRIPT_LEN,
                                 min_confidence=H.MIN_TRANSCRIPT_CONFIDENCE)

    train_sampler = BucketingSampler(train_dataset, batch_size=H.BATCH_SIZE)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               num_workers=H.NUM_WORKERS,
                                               batch_sampler=train_sampler,
                                               collate_fn=collate_fn,
                                               pin_memory=True)

    logger.info(train_dataset)

    valid_dataset = AudioDataset(os.path.join(H.ROOT_DIR, H.EXPERIMENT),
                                 manifests_files=H.MANIFESTS,
                                 datasets="test",
                                 transform=audio_transform,
                                 label_transform=label_transform,
                                 max_data_size=None,
                                 sorted_by='recording_duration')

    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=H.BATCH_SIZE,
                                               num_workers=H.NUM_WORKERS,
                                               shuffle=False,
                                               collate_fn=collate_fn,
                                               pin_memory=True)

    logger.info(valid_dataset)

    return train_loader, valid_loader, vocab
Example #22
def test(args):

    if not os.path.exists('experiments'):
        os.makedirs('experiments')

    transfs = transforms.Compose([
        # transforms.Scale(),
        prepro.DB_Spec(sr=11025, n_fft=400, hop_t=0.010, win_t=0.025)
    ])

    # mel_basis = librosa.filters.mel(16000, 256, n_mels=80, norm=1)
    # sr = 16000

    if args.model_type == 'vae_g_l':
        model = vae_g_l.VAE(args)
        model.load_state_dict(
            torch.load('experiments/' + args.model_name,
                       map_location=lambda storage, loc: storage))
    elif args.model_type == 'vae_l':
        model = vae_l.VAE(args)
        model.load_state_dict(
            torch.load('experiments/' + args.model_name,
                       map_location=lambda storage, loc: storage))

    model.eval()

    if args.dataset == "VCTK":
        # male example
        # data, sr = prepro.read_audio('/work/invx030/datasets/VCTK-Corpus/wav48/p245/p245_002.wav')
        # Female example
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/VCTK-Corpus/wav48/p233/p233_003.wav')
    elif args.dataset == "LibriSpeech":
        # male
        # data, sr = prepro.read_audio('/work/invx030/datasets/LibriSpeech/test-clean/1089/134686/1089-134686-0001.flac')
        # female
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/LibriSpeech/test-clean/4507/16021/4507-16021-0001.flac'
        )
    else:
        raise Exception('No valid dataset provided (use --dataset)')

    hop_length = int(sr * 0.010)
    n_fft = 400
    win_length = int(sr * 0.025)

    data = transfs(data)
    data = data / (torch.min(data))

    data = Variable(data)
    data = data.unsqueeze(0)

    data = data.transpose(1, 2)
    original = data

    if args.predictive:
        data = F.pad(data, (0, 0, 1, 0), "constant", 1.)
        original = F.pad(original, (0, 0, 0, 1), "constant", 1.)

    outs = model(data)
    reconstruction = outs.decoder_out
    reconstruction = reconstruction.transpose(1, 2)
    reconstruction = reconstruction.squeeze(0)
    reconstruction = (reconstruction.data.cpu()).numpy()
    reconstruction = reconstruction * -80.

    original = original.transpose(1, 2)
    original = original.squeeze(0).squeeze(0)
    original = (original.data.cpu()).numpy()
    original = original * -80.

    librosa.display.specshow(original,
                             sr=sr,
                             hop_length=hop_length,
                             x_axis='time',
                             y_axis='linear',
                             cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Original DB spectrogram')
    pylab.savefig('experiments/original_spec.png')

    plt.clf()

    librosa.display.specshow(reconstruction,
                             sr=sr,
                             hop_length=hop_length,
                             x_axis='time',
                             y_axis='linear',
                             cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Reconstruction DB spectrogram')
    pylab.savefig('experiments/reconstruction_spec.png')

    inverse = to_audio(original, sr=sr, n_fft=n_fft, hop_t=0.010, win_t=0.025)

    librosa.output.write_wav('experiments/original.wav',
                             inverse,
                             sr,
                             norm=True)

    inverse = to_audio(reconstruction,
                       sr,
                       n_fft=n_fft,
                       hop_t=0.010,
                       win_t=0.025)
    librosa.output.write_wav('experiments/reconstruction.wav',
                             inverse,
                             sr,
                             norm=True)
Example #23
    seq_M = args.seq_M
    batch_size = args.batch_size
    depth = args.depth
    radixs = [2] * depth
    N = np.prod(radixs)
    channels = args.channels
    lr = args.lr
    steps = args.steps
    c = args.c
    generation_time = args.file_size
    filename = args.outfile
    features_size = args.feature_size

    print('==> Downloading YesNo Dataset..')
    transform = transforms.Compose([transforms.Scale()])
    data = torchaudio.datasets.YESNO('./data',
                                     download=True,
                                     transform=transform)
    data_loader = DataLoader(data, batch_size=1, num_workers=2)

    print('==> Extracting features..')
    train_wav = []
    train_features = []
    train_targets = []
    for batch_idx, (inputs, _) in enumerate(data_loader):
        inputs = inputs.view(-1).numpy()
        targets = np.roll(inputs, shift=-1)

        #h = mfcc(inputs, sr, winlen=winlen, winstep=winstep, numcep=features_size - 1, winfunc=np.hamming)
        x = inputs.astype(float)
Example #24
def main():
  # Init logger
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
  print_log('save path : {}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
  print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
  print_log("torch  version : {}".format(torch.__version__), log)
  print_log("cudnn  version : {}".format(torch.backends.cudnn.version()), log)

  # Data loading code
  # Any other preprocessings? http://pytorch.org/audio/transforms.html
  sample_length = 10000
  scale = transforms.Scale()
  padtrim = transforms.PadTrim(sample_length)
  downmix = transforms.DownmixMono()
  transforms_audio = transforms.Compose([
    scale, padtrim, downmix
  ])

  if not os.path.isdir(args.data_path):
    os.makedirs(args.data_path)
  train_dir = os.path.join(args.data_path, 'train')
  val_dir = os.path.join(args.data_path, 'val')

  #Choose dataset to use
  if args.dataset == 'arctic':
    # TODO No ImageFolder equivalent for audio. Need to create a Dataset manually
    train_dataset = Arctic(train_dir, transform=transforms_audio, download=True)
    val_dataset = Arctic(val_dir, transform=transforms_audio, download=True)
    num_classes = 4
  elif args.dataset == 'vctk':
    train_dataset = dset.VCTK(train_dir, transform=transforms_audio, download=True)
    val_dataset = dset.VCTK(val_dir, transform=transforms_audio, download=True)
    num_classes = 10
  elif args.dataset == 'yesno':
    train_dataset = dset.YESNO(train_dir, transform=transforms_audio, download=True)
    val_dataset = dset.YESNO(val_dir, transform=transforms_audio, download=True)
    num_classes = 2
  else:
    assert False, 'Dataset is incorrect'

  train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.workers,
    # pin_memory=True, # What is this?
    # sampler=None     # What is this?
  )
  val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=args.batch_size, shuffle=False,
    num_workers=args.workers, pin_memory=True)


  #Feed in respective model file to pass into model (alexnet.py)
  print_log("=> creating model '{}'".format(args.arch), log)
  # Init model, criterion, and optimizer
  # net = models.__dict__[args.arch](num_classes)
  net = AlexNet(num_classes)
  #
  print_log("=> network :\n {}".format(net), log)

  # net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))

  # define loss function (criterion) and optimizer
  criterion = torch.nn.CrossEntropyLoss()

  # Define stochastic gradient descent as optimizer (run backprop on random small batch)
  optimizer = torch.optim.SGD(net.parameters(), state['learning_rate'], momentum=state['momentum'],
                weight_decay=state['decay'], nesterov=True)

  #Sets use for GPU if available
  if args.use_cuda:
    net.cuda()
    criterion.cuda()

  recorder = RecorderMeter(args.epochs)
  # optionally resume from a checkpoint
  # Need same python version that the resume was in
  if args.resume:
    if os.path.isfile(args.resume):
      print_log("=> loading checkpoint '{}'".format(args.resume), log)
      if args.ngpu == 0:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
      else:
        checkpoint = torch.load(args.resume)

      recorder = checkpoint['recorder']
      args.start_epoch = checkpoint['epoch']
      net.load_state_dict(checkpoint['state_dict'])
      optimizer.load_state_dict(checkpoint['optimizer'])
      print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
    else:
      print_log("=> no checkpoint found at '{}'".format(args.resume), log)
  else:
    print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

  if args.evaluate:
    validate(val_loader, net, criterion, 0, log, val_dataset)
    return

  # Main loop
  start_time = time.time()
  epoch_time = AverageMeter()

  # Training occurs here
  for epoch in range(args.start_epoch, args.epochs):
    current_learning_rate = adjust_learning_rate(optimizer, epoch, args.gammas, args.schedule)

    need_hour, need_mins, need_secs = convert_secs2time(epoch_time.avg * (args.epochs-epoch))
    need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)

    print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:6.4f}]'.format(time_string(), epoch, args.epochs, need_time, current_learning_rate) \
                + ' [Best : Accuracy={:.2f}, Error={:.2f}]'.format(recorder.max_accuracy(False), 100-recorder.max_accuracy(False)), log)

    print("One epoch")
    # train for one epoch
    # Call to train (note that our previous net is passed into the model argument)
    train_acc, train_los = train(train_loader, net, criterion, optimizer, epoch, log, train_dataset)

    # evaluate on validation set
    #val_acc,   val_los   = extract_features(test_loader, net, criterion, log)
    val_acc,   val_los   = validate(val_loader, net, criterion, epoch, log, val_dataset)
    is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc)

    save_checkpoint({
      'epoch': epoch + 1,
      'arch': args.arch,
      'state_dict': net.state_dict(),
      'recorder': recorder,
      'optimizer' : optimizer.state_dict(),
    }, is_best, args.save_path, 'checkpoint.pth.tar')

    # measure elapsed time
    epoch_time.update(time.time() - start_time)
    start_time = time.time()
    recorder.plot_curve( os.path.join(args.save_path, 'curve.png') )

  log.close()
Example #25
## Setting seed
import random

param.seed = param.seed or random.randint(1, 10000)
print("Random Seed: " + str(param.seed))
print("Random Seed: " + str(param.seed), file=log_output)
random.seed(param.seed)
torch.manual_seed(param.seed)
if param.cuda:
    torch.cuda.manual_seed_all(param.seed)

## Transforming audio files
trans = transf.Compose([
    transf.Scale(),  # This makes it into [-1,1]
    # transf.ToTensor(),
    transf.PadTrim(max_len=param.audio_size),  # I don't know if this is needed
    # This makes it into [-1,1] so tanh will work properly
    # transf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])


def load_sound(path):
    tensor_to_load_into = None
    import torchaudio
    out, sample_rate = torchaudio.load(path, tensor_to_load_into)
    return out


## Importing dataset
data = dset.DatasetFolder(root=param.input_folder,
                          transform=trans,
Example #26
    def get_dataloader(self):
        usl = self.loss_criterion == "crossentropy"
        ds = AUDIOSET(self.data_path, dataset=self.args.dataset, noises_dir=self.noises_dir,
                      use_cache=False, num_samples=self.args.num_samples,
                      add_no_label=self.args.add_no_label, use_single_label=usl)
        if any(x in self.model_name for x in ["resnet34_conv", "resnet101_conv", "squeezenet"]):
            T = tat.Compose([
                    #tat.PadTrim(self.max_len, fill_value=1e-8),
                    mgc_transforms.SimpleTrim(self.max_len),
                    mgc_transforms.MEL(sr=16000, n_fft=600, hop_length=300, n_mels=self.args.freq_bands//2),
                    #mgc_transforms.Scale(),
                    mgc_transforms.BLC2CBL(),
                    mgc_transforms.Resize((self.args.freq_bands, self.args.freq_bands)),
                ])
        elif "_mfcc_librosa" in self.model_name:
            T = tat.Compose([
                    #tat.PadTrim(self.max_len, fill_value=1e-8),
                    mgc_transforms.SimpleTrim(self.max_len),
                    mgc_transforms.MFCC2(sr=16000, n_fft=600, hop_length=300, n_mfcc=12),
                    mgc_transforms.Scale(),
                    mgc_transforms.BLC2CBL(),
                    mgc_transforms.Resize((self.args.freq_bands, self.args.freq_bands)),
                ])
        elif "_mfcc" in self.model_name:
            sr = 16000
            ws = 800
            hs = ws // 2
            n_fft = 512 # 256
            n_filterbanks = 26
            n_coefficients = 12
            low_mel_freq = 0
            high_freq_mel = (2595 * math.log10(1 + (sr/2) / 700))
            mel_pts = torch.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2) # sr = 16000
            hz_pts = torch.floor(700 * (torch.pow(10,mel_pts / 2595) - 1))
            bins = torch.floor((n_fft + 1) * hz_pts / sr)
            td = {
                    "RfftPow": mgc_transforms.RfftPow(n_fft),
                    "FilterBanks": mgc_transforms.FilterBanks(n_filterbanks, bins),
                    "MFCC": mgc_transforms.MFCC(n_filterbanks, n_coefficients),
                 }

            T = tat.Compose([
                    #tat.PadTrim(self.max_len, fill_value=1e-8),
                    mgc_transforms.Preemphasis(),
                    mgc_transforms.SimpleTrim(self.max_len),
                    mgc_transforms.Sig2Features(ws, hs, td),
                    mgc_transforms.DummyDim(),
                    mgc_transforms.Scale(),
                    tat.BLC2CBL(),
                    mgc_transforms.Resize((self.args.freq_bands, self.args.freq_bands)),
                ])
        elif "attn" in self.model_name:
            T = tat.Compose([
                    mgc_transforms.SimpleTrim(self.max_len),
                    mgc_transforms.MEL(sr=16000, n_fft=600, hop_length=300, n_mels=self.args.freq_bands//2),
                    #mgc_transforms.Scale(),
                    mgc_transforms.SqueezeDim(2),
                    tat.LC2CL(),
                ])
        elif "bytenet" in self.model_name:
            #offset = 714 # make clips divisible by 224
            T = tat.Compose([
                    mgc_transforms.SimpleTrim(self.max_len),
                    #tat.PadTrim(self.max_len),
                    mgc_transforms.Scale(),
                    tat.LC2CL(),
                ])
        ds.transform = T
        if self.loss_criterion == "crossentropy":
            TT = mgc_transforms.XEntENC(ds.labels_dict)
            #TT = mgc_transforms.BinENC(ds.labels_dict, dtype=torch.int64)
        else:
            TT = mgc_transforms.BinENC(ds.labels_dict)
        ds.target_transform = TT
        ds.use_cache = self.use_cache
        if self.use_cache:
            ds.init_cache()
        if self.use_precompute:
            ds.load_precompute(self.model_name)
        dl = data.DataLoader(ds, batch_size=self.batch_size, drop_last=True,
                             num_workers=self.num_workers, collate_fn=bce_collate,
                             shuffle=True)
        if "attn" in self.model_name:
            dl.collate_fn = sort_collate
        return ds, dl