Example #1
    def test_pad_trim(self):

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 1.2)

        result = transforms.PadTrim(max_len=length_new,
                                    channels_first=False)(audio_orig)
        self.assertEqual(result.size(0), length_new)

        result = transforms.PadTrim(max_len=length_new,
                                    channels_first=True)(audio_orig.transpose(
                                        0, 1))
        self.assertEqual(result.size(1), length_new)

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 0.8)

        result = transforms.PadTrim(max_len=length_new,
                                    channels_first=False)(audio_orig)

        self.assertEqual(result.size(0), length_new)

        repr_test = transforms.PadTrim(max_len=length_new,
                                       channels_first=False)
        self.assertTrue(repr_test.__repr__())
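
For reference, here is a minimal standalone sketch of the behavior this test exercises (assuming a legacy torchaudio release that still ships transforms.PadTrim): a signal shorter than max_len is zero-padded and a longer one is trimmed, along the time dimension selected by channels_first.

import torch
from torchaudio import transforms

sig = torch.randn(16000, 1)  # (length, channels), matching channels_first=False

padded = transforms.PadTrim(max_len=20000, channels_first=False)(sig)
trimmed = transforms.PadTrim(max_len=8000, channels_first=False)(sig)
print(padded.size(0), trimmed.size(0))  # 20000 8000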
Example #2
    def test_pad_trim(self):

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 1.2)

        result = transforms.PadTrim(max_len=length_new)(audio_orig)

        self.assertTrue(
            result.size(0) == length_new,
            "old size: {}, new size: {}".format(audio_orig.size(0),
                                                result.size(0)))

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 0.8)

        result = transforms.PadTrim(max_len=length_new)(audio_orig)

        self.assertTrue(
            result.size(0) == length_new,
            "old size: {}, new size: {}".format(audio_orig.size(0),
                                                result.size(0)))

        repr_test = transforms.PadTrim(max_len=length_new)
        repr_test.__repr__()
Example #3
 def test4(self):
     vx = VOXFORGE(self.bdir,
                   download=False,
                   label_type="lang",
                   num_zips=10,
                   randomize=False,
                   dev_mode=False)
     vx.find_max_len()
     T = transforms.Compose([
         transforms.PadTrim(vx.maxlen),
     ])
     TT = spl_transforms.LENC(vx.LABELS)
     vx.transform = T
     vx.target_transform = TT
     print(vx.splits)
     dl = data.DataLoader(vx, batch_size=5)
     total_train = 0
     for i, (mb, l) in enumerate(dl):
         vx.set_split("train")
         total_train += l.size(0)
         if i == 2:
             vx.set_split("valid")
             total_valid = 0
             for mb_valid, l_valid in dl:
                 total_valid += l_valid.size(0)
             print(total_valid)
     print(total_train)
Example #4
    def test1(self):
        # Data
        vx = VOXFORGE(self.bdir, label_type="lang")
        vx.find_max_len()
        print(vx.maxlen)
        T = tat.Compose([
                tat.PadTrim(vx.maxlen),
                spl_transforms.MEL(n_mels=224),
                spl_transforms.BLC2CBL(),
                tvt.ToPILImage(),
                tvt.Scale((224, 224)),
                tvt.ToTensor(),
            ])
        TT = spl_transforms.LENC(vx.LABELS)
        vx.transform = T
        vx.target_transform = TT
        dl = data.DataLoader(vx, batch_size=25, shuffle=True)

        # Model and Loss
        model = models.squeezenet.squeezenet(True)
        model.train()

        for i, (mb, tgts) in enumerate(dl):
            vx.set_split("train")
            out = model(Variable(mb))
            print(mb.size(), mb.min(), mb.max())
            print(out.data.size())
            print(out.data)
            break
Example #5
    def test1(self):
        # Data
        vx = VOXFORGE(self.bdir, label_type="lang", use_cache=True)
        #vx.find_max_len()
        vx.maxlen = 150000
        T = tat.Compose([
                tat.PadTrim(vx.maxlen),
                tat.MEL(n_mels=224),
                tat.BLC2CBL(),
                tvt.ToPILImage(),
                tvt.Scale((224, 224)),
                tvt.ToTensor(),
            ])
        TT = spl_transforms.LENC(vx.LABELS)
        vx.transform = T
        vx.target_transform = TT
        dl = data.DataLoader(vx, batch_size=25, shuffle=True)

        # Model and Loss
        model = models.resnet.resnet34(True)
        print(model)
        criterion = nn.CrossEntropyLoss()
        plist = nn.ParameterList()
        #plist.extend(list(model[0].parameters()))
        plist.extend(list(model[1].fc.parameters()))
        #plist.extend(list(model.parameters()))
        #optimizer = torch.optim.SGD(plist, lr=0.0001, momentum=0.9)
        optimizer = torch.optim.Adam(plist, lr=0.0001)

        train_losses = []
        valid_losses = []
        for i, (mb, tgts) in enumerate(dl):
            model.train()
            vx.set_split("train")
            mb, tgts = Variable(mb), Variable(tgts)
            model.zero_grad()
            out = model(mb)
            loss = criterion(out, tgts)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.data[0])
            print(loss.data[0])
            if i % 5 == 0:
                start = time.time()
                model.eval()
                vx.set_split("valid")
                running_validation_loss = 0
                correct = 0
                for mb_valid, tgts_valid in dl:
                    mb_valid, tgts_valid = Variable(mb_valid), Variable(tgts_valid)
                    out_valid = model(mb_valid)
                    loss_valid = criterion(out_valid, tgts_valid)
                    running_validation_loss += loss_valid.data[0]
                    correct += (out_valid.data.max(1)[1] == tgts_valid.data).sum()
                print_running_time(start)
                valid_losses.append((running_validation_loss, correct / len(vx)))
                print("loss: {}, acc: {}".format(running_validation_loss, correct / len(vx)))
            if i == 11: break
            vx.set_split("train")
Example #6
def get_loader(config, data_dir):
    root = os.path.join(os.path.abspath(os.curdir), data_dir)
    print('-- Loading audios')
    dataset = AudioFolder(root=root,
                          transform=transforms.Compose([
                              transforms.PadTrim(133623, 0),  # max_len=133623, fill_value=0
                              transforms.LC2CL()  # (length, channels) -> (channels, length)
                          ]))
    loader = DataLoader(dataset=dataset,
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=config.num_workers)
    return loader
Example #7
    def test_compose(self):

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 1.2)
        maxminmax = np.abs([audio_orig.min(),
                            audio_orig.max()]).max().astype(float)

        tset = (transforms.Scale(factor=maxminmax),
                transforms.PadTrim(max_len=length_new))
        result = transforms.Compose(tset)(audio_orig)

        self.assertTrue(np.abs([result.min(), result.max()]).max() == 1.)

        self.assertTrue(result.size(0) == length_new)
Example #8
    def test_compose(self):

        audio_orig = self.sig.clone()
        length_orig = audio_orig.size(0)
        length_new = int(length_orig * 1.2)
        maxminmax = max(abs(audio_orig.min()), abs(audio_orig.max())).item()

        tset = (transforms.Scale(factor=maxminmax),
                transforms.PadTrim(max_len=length_new, channels_first=False))
        result = transforms.Compose(tset)(audio_orig)

        self.assertTrue(max(abs(result.min()), abs(result.max())) == 1.)

        self.assertTrue(result.size(0) == length_new)

        repr_test = transforms.Compose(tset)
        self.assertTrue(repr_test.__repr__())
Example #9
 def test4(self):
     ds = AUDIOSET(self.bdir)
     T = transforms.Compose([
         transforms.PadTrim(ds.maxlen),
     ])
     TT = mgc_transforms.BinENC(ds.labels_dict)  # binary label encoder, as in the other AUDIOSET tests
     ds.transform = T
     ds.target_transform = TT
     dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
     total_train = 0
     for i, (mb, l) in enumerate(dl):
         total_train += l.size(0)
         if i == 2:
             #ds.set_split("valid")
             total_valid = 0
             for mb_valid, l_valid in dl:
                 total_valid += l_valid.size(0)
             print(total_valid)
     print(total_train)
Example #10
 def test3(self):
     """
         Test that the data loader does transforms
     """
     ds = AUDIOSET(self.bdir, randomize=True)
     T = transforms.Compose([
         transforms.PadTrim(ds.maxlen),
         mgc_transforms.MEL(),
         mgc_transforms.BLC2CBL()
     ])
     TT = mgc_transforms.BinENC(ds.labels_dict)
     ds.transform = T
     ds.target_transform = TT
     dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
     labels_total = 0
     print(ds.labels_dict)
     for i, (a, b) in enumerate(dl):
         print(a.size(), b.size())
         if i > 10: break
Example #11
 def test5(self):
     import numpy as np
     vx = VOXFORGE(self.bdir,
                   download=False,
                   label_type="prompts",
                   dev_mode=False)
     vx.find_max_len()
     T = transforms.Compose([
         transforms.PadTrim(vx.maxlen),
     ])
     TT = spl_transforms.WC()
     vx.transform = T
     vx.target_transform = TT
     print(vx.splits)
     dl = data.DataLoader(vx, batch_size=5, collate_fn=basic_collate)
     max_wc = 0
     wc_all = []
     for i, (mb, tgts) in enumerate(dl):
         tgts = sorted(tgts)  # sorted() returns a new list, so keep the result
         max_wc = max(max_wc, tgts[-1])
         wc_all.extend(tgts)
     print(np.histogram(wc_all, bins=20), len(wc_all))
Example #12
 def test3(self):
     vx = VOXFORGE(self.bdir,
                   download=False,
                   label_type="lang",
                   num_zips=10,
                   randomize=False,
                   split="valid",
                   dev_mode=False)
     vx.find_max_len()
     T = transforms.Compose([
         transforms.PadTrim(vx.maxlen),
         spl_transforms.MEL(),
         spl_transforms.BLC2CBL()
     ])
     TT = spl_transforms.LENC(vx.LABELS)
     vx.transform = T
     vx.target_transform = TT
     dl = data.DataLoader(vx, batch_size=5)
     labels_total = 0
     for i, (a, b) in enumerate(dl):
         labels_total += b.sum()
     print((len(vx) - labels_total) / len(vx))
Example #13
File: test_size.py Project: dhpollack/mgc
    def test1(self):
        """
            Test that the data loader does transforms
        """

        NMELS = 224

        ds = AUDIOSET(self.bdir, randomize=True)
        T = transforms.Compose([
            transforms.PadTrim(ds.maxlen),
            mgc_transforms.MEL(n_mels=NMELS),
            mgc_transforms.BLC2CBL()
        ])
        TT = mgc_transforms.BinENC(ds.labels_dict)
        ds.transform = T
        ds.target_transform = TT
        dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
        labels_total = 0
        for i, (a, b) in enumerate(dl):
            print(a.size(), b.size())
            break

        self.assertTrue(a.size()[-2:] == (NMELS, 313))
Example #14
## Setting seed
import random

param.seed = param.seed or random.randint(1, 10000)
print("Random Seed: " + str(param.seed))
print("Random Seed: " + str(param.seed), file=log_output)
random.seed(param.seed)
torch.manual_seed(param.seed)
if param.cuda:
    torch.cuda.manual_seed_all(param.seed)

## Transforming audio files
trans = transf.Compose([
    transf.Scale(),  # This makes it into [-1,1]
    # transf.ToTensor(),
    transf.PadTrim(max_len=param.audio_size),  # I don't know if this is needed
    # This makes it into [-1,1] so tanh will work properly
    # transf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])


def load_sound(path):
    tensor_to_load_into = None
    import torchaudio
    out, sample_rate = torchaudio.load(path, tensor_to_load_into)
    return out


## Importing dataset
data = dset.DatasetFolder(root=param.input_folder,
                          transform=trans,
Example #15
    batch_size = args.batch_size
    depth = args.depth
    radixs = [2] * depth
    N = np.prod(radixs)
    channels = args.channels
    lr = args.lr
    steps = args.steps
    c = args.c
    generation_time = args.file_size
    filename = args.outfile

    maxlen = 50000
    print('==> Downloading YesNo Dataset..')
    transform = transforms.Compose(
        [transforms.Scale(),
         transforms.PadTrim(maxlen),
         transforms.MuLawEncoding(quantization_channels=channels)])
    data = torchaudio.datasets.YESNO('./data', download=True, transform=transform)
    data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=True)

    print('==> Building model..')
    net = general_FFTNet(radixs, 128, channels).cuda()

    print(sum(p.numel() for p in net.parameters() if p.requires_grad), "of parameters.")

    optimizer = optim.Adam(net.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    print("Start Training.")
    a = datetime.now().replace(microsecond=0)
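
As an aside on the pipeline above: MuLawEncoding maps a waveform in [-1, 1] (hence the preceding Scale) onto integer class ids in [0, quantization_channels), which is the target format CrossEntropyLoss expects. A minimal sketch, assuming the same legacy torchaudio API:

import torch
from torchaudio import transforms

x = torch.rand(100) * 2 - 1  # waveform scaled into [-1, 1]
q = transforms.MuLawEncoding(quantization_channels=256)(x)
print(q.min().item(), q.max().item())  # integer class ids inside [0, 256)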
Example #16
File: main.py Project: jozhang97/WaveApp
def main():
  # Init logger
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
  print_log('save path : {}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
  print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
  print_log("torch  version : {}".format(torch.__version__), log)
  print_log("cudnn  version : {}".format(torch.backends.cudnn.version()), log)

  # Data loading code
  # Any other preprocessings? http://pytorch.org/audio/transforms.html
  sample_length = 10000
  scale = transforms.Scale()
  padtrim = transforms.PadTrim(sample_length)
  downmix = transforms.DownmixMono()
  transforms_audio = transforms.Compose([
    scale, padtrim, downmix
  ])

  if not os.path.isdir(args.data_path):
    os.makedirs(args.data_path)
  train_dir = os.path.join(args.data_path, 'train')
  val_dir = os.path.join(args.data_path, 'val')

  #Choose dataset to use
  if args.dataset == 'arctic':
    # TODO No ImageFolder equivalent for audio. Need to create a Dataset manually
    train_dataset = Arctic(train_dir, transform=transforms_audio, download=True)
    val_dataset = Arctic(val_dir, transform=transforms_audio, download=True)
    num_classes = 4
  elif args.dataset == 'vctk':
    train_dataset = dset.VCTK(train_dir, transform=transforms_audio, download=True)
    val_dataset = dset.VCTK(val_dir, transform=transforms_audio, download=True)
    num_classes = 10
  elif args.dataset == 'yesno':
    train_dataset = dset.YESNO(train_dir, transform=transforms_audio, download=True)
    val_dataset = dset.YESNO(val_dir, transform=transforms_audio, download=True)
    num_classes = 2
  else:
    assert False, 'Dataset is incorrect'

  train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.workers,
    # pin_memory=True, # What is this?
    # sampler=None     # What is this?
  )
  val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=args.batch_size, shuffle=False,
    num_workers=args.workers, pin_memory=True)


  #Feed in respective model file to pass into model (alexnet.py)
  print_log("=> creating model '{}'".format(args.arch), log)
  # Init model, criterion, and optimizer
  # net = models.__dict__[args.arch](num_classes)
  net = AlexNet(num_classes)
  #
  print_log("=> network :\n {}".format(net), log)

  # net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))

  # define loss function (criterion) and optimizer
  criterion = torch.nn.CrossEntropyLoss()

  # Define stochastic gradient descent as optimizer (run backprop on random small batch)
  optimizer = torch.optim.SGD(net.parameters(), state['learning_rate'], momentum=state['momentum'],
                weight_decay=state['decay'], nesterov=True)

  #Sets use for GPU if available
  if args.use_cuda:
    net.cuda()
    criterion.cuda()

  recorder = RecorderMeter(args.epochs)
  # optionally resume from a checkpoint
  # Need same python version that the resume was in
  if args.resume:
    if os.path.isfile(args.resume):
      print_log("=> loading checkpoint '{}'".format(args.resume), log)
      if args.ngpu == 0:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
      else:
        checkpoint = torch.load(args.resume)

      recorder = checkpoint['recorder']
      args.start_epoch = checkpoint['epoch']
      net.load_state_dict(checkpoint['state_dict'])
      optimizer.load_state_dict(checkpoint['optimizer'])
      print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
    else:
      print_log("=> no checkpoint found at '{}'".format(args.resume), log)
  else:
    print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

  if args.evaluate:
    validate(val_loader, net, criterion, 0, log, val_dataset)
    return

  # Main loop
  start_time = time.time()
  epoch_time = AverageMeter()

  # Training occurs here
  for epoch in range(args.start_epoch, args.epochs):
    current_learning_rate = adjust_learning_rate(optimizer, epoch, args.gammas, args.schedule)

    need_hour, need_mins, need_secs = convert_secs2time(epoch_time.avg * (args.epochs-epoch))
    need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)

    print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:6.4f}]'.format(time_string(), epoch, args.epochs, need_time, current_learning_rate) \
                + ' [Best : Accuracy={:.2f}, Error={:.2f}]'.format(recorder.max_accuracy(False), 100-recorder.max_accuracy(False)), log)

    print("One epoch")
    # train for one epoch
    # Call to train (note that our previous net is passed into the model argument)
    train_acc, train_los = train(train_loader, net, criterion, optimizer, epoch, log, train_dataset)

    # evaluate on validation set
    #val_acc,   val_los   = extract_features(test_loader, net, criterion, log)
    val_acc,   val_los   = validate(val_loader, net, criterion, epoch, log, val_dataset)
    is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc)

    save_checkpoint({
      'epoch': epoch + 1,
      'arch': args.arch,
      'state_dict': net.state_dict(),
      'recorder': recorder,
      'optimizer' : optimizer.state_dict(),
    }, is_best, args.save_path, 'checkpoint.pth.tar')

    # measure elapsed time
    epoch_time.update(time.time() - start_time)
    start_time = time.time()
    recorder.plot_curve( os.path.join(args.save_path, 'curve.png') )

  log.close()
Example #17
import torch
import torchaudio.datasets as dset
from torchaudio import transforms

transform = transforms.Compose(
    [transforms.Scale(), transforms.PadTrim(100000)])

train_dataset = dset.YESNO("data", transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=10,
)

for i, (input, target) in enumerate(train_loader):
    import ipdb
    ipdb.set_trace(context=21)
    print("HI")
""" Vision MNIST test"""
"""
import torchvision.datasets as vdset
from torchvision import transforms as vtransforms

transform = vtransforms.Compose([
        vtransforms.ToTensor()
        ])

mnist = vdset.MNIST("data", transform=transform, download=True)

mnist_loader = torch.utils.data.DataLoader(
  mnist,
  batch_size=10,
Example #18
low_mel_freq = 0
high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
mel_pts = np.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2)
hz_pts = np.floor(700 * (10**(mel_pts / 2595) - 1))
bins = np.floor((n_fft + 1) * hz_pts / sr)

# data transformations
td = {
    "RfftPow": RfftPow(n_fft),
    "FilterBanks": FilterBanks(n_filterbanks, bins),
    "MFCC": MFCC(n_filterbanks, n_coefficients),
}

transforms = tat.Compose([
    tat.Scale(),
    tat.PadTrim(58000, fill_value=1e-8),  # pad with a small nonzero value rather than exact zeros
    Preemphasis(),
    Sig2Features(ws, hs, td),
])

# set network parameters
use_cuda = torch.cuda.is_available()
batch_size = args.batch_size
input_features = 26
hidden_size = 100
output_size = 3
#output_length = (8 + 7 + 2) # with "blanks"
output_length = 8  # without blanks
n_layers = 1
attn_modus = "dot"
Example #19
def evaluate():
  num_classes = 4

  # Init logger
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
  print_log('save path : {}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
  print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
  print_log("torch  version : {}".format(torch.__version__), log)
  print_log("cudnn  version : {}".format(torch.backends.cudnn.version()), log)

  # Any other preprocessings? http://pytorch.org/audio/transforms.html
  sample_length = 10000
  scale = transforms.Scale()
  padtrim = transforms.PadTrim(sample_length)
  transforms_audio = transforms.Compose([
    scale, padtrim
  ])


  # Data loading
  fs, data = wavfile.read(args.file_name)
  data = torch.from_numpy(data).float()
  data = data.unsqueeze(1)
  audio = transforms_audio(data)
  audio = Variable(audio)
  audio = audio.view(1, -1)
  audio = audio.unsqueeze(0)


  #Feed in respective model file to pass into model (alexnet.py)
  print_log("=> creating model '{}'".format(args.arch), log)

  # Init model, criterion, and optimizer
  # net = models.__dict__[args.arch](num_classes)
  net = AlexNet(num_classes)
  print_log("=> network :\n {}".format(net), log)


  #Sets use for GPU if available
  if args.use_cuda:
    net.cuda()

  # optionally resume from a checkpoint
  # Need same python version that the resume was in
  if args.resume:
    if os.path.isfile(args.resume):
      print_log("=> loading checkpoint '{}'".format(args.resume), log)
      if args.ngpu == 0:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
      else:
        checkpoint = torch.load(args.resume)

      recorder = checkpoint['recorder']
      args.start_epoch = checkpoint['epoch']
      net.load_state_dict(checkpoint['state_dict'])
      print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
    else:
      print_log("=> no checkpoint found at '{}'".format(args.resume), log)
  else:
    print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

  net.eval()
  if args.use_cuda:
    audio = audio.cuda()
  output = net(audio)
  print(output)
  # TODO postprocess output to a string representing the person speaking
  # output = val_dataset.postprocess_target(output)
  return