示例#1
0
    def data_loaders(self, batch_size):
        """Build train/test datasets, samplers and loaders; store them on self."""
        train_dataset = SpectrogramDataset(
            audio_conf=self.audio_conf,
            manifest_filepath=self.train_manifest,
            labels=self.labels,
            normalize=True,
            speed_volume_perturb=self.speed_volume_perturb,
            spec_augment=self.spec_augment)
        # Validation data is never perturbed or augmented.
        test_dataset = SpectrogramDataset(
            audio_conf=self.audio_conf,
            manifest_filepath=self.val_manifest,
            labels=self.labels,
            normalize=True,
            speed_volume_perturb=False,
            spec_augment=False)

        # Pick the sampler that matches the (non-)distributed setup.
        if self.distributed:
            print('DistributedBucketingSampler')
            self.train_sampler = DistributedBucketingSampler(
                train_dataset,
                batch_size=batch_size,
                num_replicas=self.args.world_size,
                rank=self.rank)
        else:
            print('BucketingSampler')
            self.train_sampler = BucketingSampler(train_dataset,
                                                  batch_size=batch_size)

        self.train_loader = AudioDataLoader(train_dataset,
                                            num_workers=self.args.num_workers,
                                            batch_sampler=self.train_sampler)
        self.test_loader = AudioDataLoader(test_dataset,
                                           batch_size=self.args.batch_size,
                                           num_workers=self.args.num_workers)

        # When resuming past epoch 0 (or when sorta-grad is disabled),
        # re-shuffle batches to match where training left off.
        resuming_with_shuffle = not self.no_shuffle and self.start_epoch != 0
        if resuming_with_shuffle or self.no_sorta_grad:
            print("Shuffling batches for the following epochs")
            self.train_sampler.shuffle(self.start_epoch)
示例#2
0
def main(args):
    """Build datasets, model and optimizer for Conv-TasNet, then train."""
    # data
    tr_dataset = AudioDataset(args.train_json,
                              sample_rate=args.sample_rate,
                              segment_length=args.segment_length)
    cv_dataset = AudioDataset(args.valid_json,
                              sample_rate=args.sample_rate,
                              segment_length=args.segment_length)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=args.batch_size,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=args.batch_size,
                                num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model
    model = ConvTasNet(args.N, args.L, args.B, args.H, args.P, args.X,
                       args.R, args.C,
                       norm_type=args.norm_type,
                       causal=args.causal,
                       mask_nonlinear=args.mask_nonlinear)
    print(model)
    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()

    # optimizer — learning rate is scaled down by the number of
    # gradient-accumulation steps per update.
    lr = args.lr / args.batch_per_step
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
示例#3
0
def main(args):
    """Build the FaSNet-TAC data pipeline, model and optimizer, then train."""
    # data
    tr_dataset = AudioDataset('tr', batch_size=args.batch_size,
                              sample_rate=args.sample_rate, nmic=args.mic)
    cv_dataset = AudioDataset('val', batch_size=args.batch_size,
                              sample_rate=args.sample_rate, nmic=args.mic)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=0)  # num_workers=0 for PC
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=0)  # num_workers=0 for PC
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model
    model = FaSNet_TAC(enc_dim=args.enc_dim,
                       feature_dim=args.feature_dim,
                       hidden_dim=args.hidden_dim,
                       layer=args.layer,
                       segment_size=args.segment_size,
                       nspk=args.nspk,
                       win_len=args.win_len,
                       context_len=args.context_len,
                       sr=args.sample_rate)

    # Report trainable-parameter count.
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# of parameters:', n_params)

    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()

    # optimizer
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
示例#4
0
def main(args):
    """Train Conv-TasNet: assemble loaders, model, optimizer and Solver."""
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate,
                              segment=args.segment)
    # cv: batch_size=1 to use less GPU memory, segment=-1 for full audio.
    cv_dataset = AudioDataset(args.valid_dir, batch_size=1,
                              sample_rate=args.sample_rate,
                              segment=-1, cv_maxlen=args.cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle, num_workers=4)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=4,
                                pin_memory=True)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model
    model = ConvTasNet(args.N, args.L, args.B, args.H, args.P, args.X,
                       args.R, args.C,
                       norm_type=args.norm_type,
                       causal=args.causal,
                       mask_nonlinear=args.mask_nonlinear)
    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()

    # optimizer
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
示例#5
0
File: train.py — Project: JJoving/SMLAT
def main(args):
    """Construct the seq2seq ASR solver and run training.

    Builds train/validation datasets and loaders from json manifests, an
    encoder-decoder model sized from the dictionary in ``args.dict``, and an
    SGD or Adam optimizer, then delegates to ``Solver.train()``.
    """
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    # Loader batch_size stays 1 — presumably the dataset batches internally
    # (it receives args.batch_size above); TODO confirm against AudioDataset.
    tr_loader = AudioDataLoader(tr_dataset,
                                batch_size=1,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset,
                                batch_size=1,
                                num_workers=args.num_workers)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput,
                      args.ehidden,
                      args.elayer,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size,
                      args.dembed,
                      sos_id,
                      eos_id,
                      args.dhidden,
                      args.dlayer,
                      bidirectional_encoder=args.ebidirectional)
    model = Seq2Seq(encoder, decoder)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizier = torch.optim.SGD(model.parameters(),
                                     lr=args.lr,
                                     momentum=args.momentum,
                                     weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizier = torch.optim.Adam(model.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    # (removed dead local `ctc = 0` — it was assigned and never referenced)
    solver = Solver(data, model, optimizier, args)
    solver.train()
def separate(model, dataset, output_dir, sr=8000):
    """Separate every mixture in *dataset* with *model* and write wav files.

    For each utterance ``name`` this writes ``<output_dir>/<name>.wav`` (the
    input mixture) and ``<output_dir>/<name>_s<c>.wav`` for each estimated
    source ``c`` (1-based), all at sample rate *sr*.
    """
    model.to(device)
    model.eval()

    # Load data one utterance at a time.
    data_loader = AudioDataLoader(dataset, batch_size=1)

    # Robust against pre-existing/nested output directories.
    os.makedirs(output_dir, exist_ok=True)

    with torch.no_grad():
        for mixture, name in data_loader:
            mixture = mixture.to(device)
            # Forward; squeeze the batch dim -> [C, T] (assumes B == 1).
            estimate_source = model(mixture).squeeze(0)

            # BUG FIX: results were previously written to the literal path
            # '(unknown).wav' (every utterance overwrote the same file) and
            # the computed filename was never used.  Also, str.strip('.wav')
            # strips any trailing '.', 'w', 'a', 'v' characters rather than
            # removing the extension — use os.path.splitext instead.
            base = os.path.join(output_dir, os.path.splitext(name)[0])
            librosa.output.write_wav(f'{base}.wav',
                                     mixture.squeeze(0).cpu().numpy(), sr)
            for c in range(estimate_source.size(0)):
                librosa.output.write_wav(f'{base}_s{c + 1}.wav',
                                         estimate_source[c].cpu().numpy(), sr)
示例#7
0
def main(args):
    """Train Conv-TasNet from json manifests (tr_json / cv_json)."""
    # data
    tr_dataset = AudioDataset(args.tr_json,
                              sample_rate=args.sample_rate,
                              segment=args.segment,
                              drop=args.drop)
    # Validation uses full-length audio (segment=-1) and drops nothing.
    cv_dataset = AudioDataset(args.cv_json,
                              sample_rate=args.sample_rate,
                              drop=0, segment=-1)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=args.batch_size,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model (reference sizes: N=512, L=32, B=128, Sc=128, H=512, X=8, R=3, P=3, C=2)
    model = ConvTasNet(args.N, args.L, args.B, args.Sc, args.H, args.X,
                       args.R, args.P, args.C)
    print(model)

    if args.use_cuda:
        # NOTE: GPU ids are hard-coded here.
        os.environ["CUDA_VISIBLE_DEVICES"] = '5,6,7'
        model = torch.nn.DataParallel(model)
        model.cuda()

    # optimizer
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
def main(args):
    """Train a DPTNet separator on the given train/validation directories."""
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate,
                              segment=args.segment)
    # cv: batch_size=1 to use less GPU memory, segment=-1 for full audio.
    cv_dataset = AudioDataset(args.valid_dir, batch_size=1,
                              sample_rate=args.sample_rate,
                              segment=-1, cv_maxlen=args.cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model
    model = DPTNet(args.N, args.C, args.L, args.H, args.K, args.B)
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# of parameters:', n_params)

    if args.use_cuda:
        # NOTE: GPU ids are hard-coded here.
        os.environ["CUDA_VISIBLE_DEVICES"] = '5,6,7'
        model = torch.nn.DataParallel(model)
        model.cuda()

    # optimizer
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
示例#9
0
File: train.py — Project: entn-at/TasNet
def main(args):
    """Train an LSTM TasNet on the given train/validation directories."""
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate, L=args.L)
    cv_dataset = AudioDataset(args.valid_dir, args.batch_size,
                              sample_rate=args.sample_rate, L=args.L)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model
    model = TasNet(args.L, args.N, args.hidden_size, args.num_layers,
                   bidirectional=args.bidirectional, nspk=args.nspk)
    print(model)
    model.cuda()

    # optimizer
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
示例#10
0
def evaluate(args):
    """Report average SI-SNRi (and optionally SDRi) of a saved Conv-TasNet."""
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0

    # Load model
    model = ConvTasNet.load_model(args.model_path)
    print(model)
    model.eval()
    # The original `if True:` stood in for a commented-out
    # `if args.use_cuda:` — CUDA is used unconditionally here.
    model.cuda()

    # Load data (full-length utterances: segment=-1).
    dataset = AudioDataset(args.data_dir,
                           args.batch_size,
                           sample_rate=args.sample_rate,
                           segment=-1)
    data_loader = AudioDataLoader(dataset, batch_size=1, num_workers=2)

    with torch.no_grad():
        for batch in data_loader:
            padded_mixture, mixture_lengths, padded_source = batch
            padded_mixture = padded_mixture.cuda()
            mixture_lengths = mixture_lengths.cuda()
            padded_source = padded_source.cuda()

            # Forward pass + permutation-invariant loss/reordering.
            estimate_source = model(padded_mixture)  # [B, C, T]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)

            # Strip padding; score against the reordered estimates.
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            estimate_source = remove_pad(reorder_estimate_source,
                                         mixture_lengths)

            # Per-utterance metrics.
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                print("Utt", total_cnt + 1)
                if args.cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    print("\tSDRi={0:.2f}".format(avg_SDRi))
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
                total_SISNRi += avg_SISNRi
                total_cnt += 1

    if args.cal_sdr:
        print("Average SDR improvement: {0:.2f}".format(total_SDRi /
                                                        total_cnt))
    print("Average SISNR improvement: {0:.2f}".format(total_SISNRi /
                                                      total_cnt))
示例#11
0
def main(args):
    """Build data loaders and a FaSNet_base model, then launch training."""
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate,
                              segment=args.segment)
    # cv: batch_size=1 to use less GPU memory, segment=-1 for full audio.
    cv_dataset = AudioDataset(args.valid_dir, batch_size=1,
                              sample_rate=args.sample_rate,
                              segment=-1, cv_maxlen=args.cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model — hyper-parameters are hard-coded here rather than taken from args.
    model = FaSNet_base(enc_dim=256, feature_dim=64, hidden_dim=128,
                        layer=6, segment_size=250, nspk=2, win_len=2)
    print(model)

    if args.use_cuda:
        model.cuda()

    # optimizer
    if args.optimizer == 'sgd':
        optim = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # solver
    Solver(data, model, optim, args).train()
def main(train_dir, batch_size, sample_rate, segment, valid_dir, cv_maxlen,
         shuffle, num_workers, N, L, B, H, P, X, R, C, norm_type, causal,
         mask_nonlinear, use_cuda, optimizer, lr, momentum, l2,
         epochs=100, half_lr=True, early_stop=True, max_norm=5,
         save_folder='exp/temp', checkpoint=False, continue_from='',
         model_path='final.pth.tar', print_freq=10, visdom=False,
         visdom_epoch=False, visdom_id='Conv-TasNet training'):
    """Train Conv-TasNet from explicit parameters (no argparse namespace).

    BUG FIX: the final ``Solver(...)`` call read ``epochs``, ``half_lr``,
    ``early_stop``, ..., ``visdom_id`` which were never defined anywhere,
    so this function always raised NameError at the end. They are now
    keyword parameters with defaults (backward-compatible — all previous
    positional arguments are unchanged).
    NOTE(review): confirm these default values against the Solver
    implementation; they cannot be verified from this file.
    """
    # data
    tr_dataset = AudioDataset(train_dir, batch_size,
                              sample_rate=sample_rate, segment=segment)
    cv_dataset = AudioDataset(valid_dir, batch_size=1,  # 1 -> use less GPU memory to do cv
                              sample_rate=sample_rate,
                              segment=-1, cv_maxlen=cv_maxlen)  # -1 -> use full audio
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=shuffle,
                                num_workers=num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = ConvTasNet(N, L, B, H, P, X, R, C,
                       norm_type=norm_type, causal=causal,
                       mask_nonlinear=mask_nonlinear)
    print(model)
    if use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer — the string parameter is rebound to the optimizer object
    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=momentum,
                                    weight_decay=l2)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     weight_decay=l2)
    else:
        print("Not support optimizer")
        return

    # solver
    solver = Solver(data, model, optimizer, use_cuda, epochs, half_lr,
                    early_stop, max_norm, save_folder, checkpoint,
                    continue_from, model_path, print_freq, visdom,
                    visdom_epoch, visdom_id)
    solver.train()
def evaluate(model, dataset, batch_size=2, verbose=1, cal_sdr=False):
    """Evaluate separation quality of *model* over *dataset*.

    Prints per-utterance SI-SNRi (and SDRi when *cal_sdr* is true) plus the
    dataset averages. *verbose*==1 enables per-utterance printing.
    """
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0

    model.eval()
    model.to(device)

    data_loader = AudioDataLoader(dataset,
                                  batch_size=batch_size,
                                  shuffle=False)

    with torch.no_grad():
        for audio, mixture_lengths in data_loader:
            # Channel 0 is the mixture; the remaining channels are the
            # reference sources — presumably [B, 1+C, T]; TODO confirm
            # against the dataset's collate function.
            padded_mixture = audio[:, 0]
            padded_source = audio[:, 1:]

            padded_mixture = padded_mixture.to(device)
            mixture_lengths = mixture_lengths.to(device)
            padded_source = padded_source.to(device)

            # Forward pass + permutation-invariant loss/reordering.
            estimate_source = model(padded_mixture)  # [B, C, T]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)

            # Strip padding; score against the reordered estimates.
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            estimate_source = remove_pad(reorder_estimate_source,
                                         mixture_lengths)

            # Per-utterance metrics.
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                if verbose == 1: print("Utt", total_cnt + 1)
                if cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    # BUG FIX: ':.{2}' meant "2 significant digits, general
                    # format" (e.g. prints '1.2e+01' for values >= 10); use
                    # fixed-point ':.2f' like the project's other evaluate()
                    # implementations.
                    if verbose == 1: print(f"\tSDRi={avg_SDRi:.2f}")

                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                if verbose == 1: print(f"\tSI-SNRi={avg_SISNRi:.2f}")
                total_SISNRi += avg_SISNRi
                total_cnt += 1

    if cal_sdr:
        print(f"Average SDR improvement: {total_SDRi / total_cnt:.2f}")
    print(f"Average SISNR improvement: {total_SISNRi / total_cnt:.2f}")
示例#14
0
    # Ad-hoc smoke test for AudioDataset / AudioDataLoader.  This code is
    # indented interior of a definition whose header lies outside this chunk;
    # train_json, batch_size, max_length_in and max_length_out are presumably
    # defined earlier in that scope — TODO confirm.
    num_batches = 10
    num_workers = 2
    batch_frames = 2000

    # test batch_frames
    train_dataset = AudioDataset(
        train_json, batch_size, max_length_in, max_length_out, num_batches,
        batch_frames=batch_frames)
    for i, minibatch in enumerate(train_dataset):
        print(i)
        print(minibatch)
    exit(0)  # NOTE: everything below is unreachable while this exit remains

    # test
    train_dataset = AudioDataset(
        train_json, batch_size, max_length_in, max_length_out, num_batches)
    # NOTE: must set batch_size=1 here.
    train_loader = AudioDataLoader(
        train_dataset, batch_size=1, num_workers=num_workers, LFR_m=4, LFR_n=3)

    import torch
    #torch.set_printoptions(threshold=10000000)
    for i, (data) in enumerate(train_loader):
        inputs, inputs_lens, targets = data
        print(i)
        # print(inputs)
        print(inputs.size())
        print(inputs_lens)
        # print(targets)
        print("*"*20)
def evaluate(args):
    """Evaluate a trained DPTNet checkpoint: average SI-SNRi and optional SDRi.

    Loads the checkpoint in ``args.model_path``, runs the model over
    ``args.data_dir`` with full-length utterances, and prints per-utterance
    and average improvement metrics.
    """
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0

    # Build model
    model = DPTNet(args.N, args.C, args.L, args.H, args.K, args.B)

    # model.load_state_dict(torch.load(args.model_path, map_location='cpu'))
    model_info = torch.load(args.model_path)

    # Strip a possible 'module.' prefix (checkpoint saved from DataParallel).
    state_dict = OrderedDict()
    for k, v in model_info['model_state_dict'].items():
        name = k.replace("module.", "")  # remove 'module.'
        state_dict[name] = v
    # BUG FIX: load weights into the bare model *before* wrapping it in
    # DataParallel — previously the 'module.'-stripped keys could not match
    # an already-wrapped model's 'module.'-prefixed parameter names.
    model.load_state_dict(state_dict)

    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()

    # BUG FIX: switch to inference mode (disables dropout etc.), matching
    # the other evaluate() implementations in this project.
    model.eval()

    print(model)

    # Load data (full-length utterances: segment=-1)
    dataset = AudioDataset(args.data_dir,
                           args.batch_size,
                           sample_rate=args.sample_rate,
                           segment=-1)
    data_loader = AudioDataLoader(dataset, batch_size=1, num_workers=2)

    with torch.no_grad():
        for i, (data) in enumerate(data_loader):
            # Get batch data
            padded_mixture, mixture_lengths, padded_source = data
            if args.use_cuda:
                padded_mixture = padded_mixture.cuda()
                mixture_lengths = mixture_lengths.cuda()
                padded_source = padded_source.cuda()
            # Forward
            estimate_source = model(padded_mixture)  # [B, C, T]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)
            # Remove padding and flat
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            # NOTE: use reorder estimate source
            estimate_source = remove_pad(reorder_estimate_source,
                                         mixture_lengths)
            # for each utterance
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                print("Utt", total_cnt + 1)
                # Compute SDRi
                if args.cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    print("\tSDRi={0:.2f}".format(avg_SDRi))
                # Compute SI-SNRi
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
                total_SISNRi += avg_SISNRi
                total_cnt += 1
    if args.cal_sdr:
        print("Average SDR improvement: {0:.2f}".format(total_SDRi /
                                                        total_cnt))
    print("Average SISNR improvement: {0:.2f}".format(total_SISNRi /
                                                      total_cnt))
示例#16
0
def main(args):
    """Train a speech Transformer: data, encoder/decoder, warmup optimizer."""
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                shuffle=args.shuffle,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # model — the encoder input dim is multiplied by the LFR stacking factor
    encoder = Encoder(args.d_input * args.LFR_m, args.n_layers_enc,
                      args.n_head, args.d_k, args.d_v, args.d_model,
                      args.d_inner, dropout=args.dropout,
                      pe_maxlen=args.pe_maxlen)
    decoder = Decoder(sos_id, eos_id, vocab_size, args.d_word_vec,
                      args.n_layers_dec, args.n_head, args.d_k, args.d_v,
                      args.d_model, args.d_inner, dropout=args.dropout,
                      tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                      pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)
    print(model)
    model.cuda()

    # optimizer (NOTE: device ids 0-3 are hard-coded here)
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    optim = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps)

    # solver
    Solver(data, model, optim, args).train()
示例#17
0
def main(args):
    """Build data loaders, the encoder/decoder/CTC model and optimizer, then train.

    Args:
        args: parsed command-line namespace providing dataset paths
            (train_json, valid_json, dict), loader settings (num_workers,
            LFR_m, LFR_n, align_trun), model hyper-parameters (einput,
            ehidden, elayer, dembed, dhidden, dlayer, offset, atype,
            edropout, ebidirectional, etype) and optimizer settings
            (optimizer, lr, momentum, l2).
    """
    # --- data ---
    tr_dataset = AudioDataset(args.train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out)
    # batch_size=1 because AudioDataset already packs utterances into batches.
    tr_loader = AudioDataLoader(tr_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    cv_loader = AudioDataLoader(cv_dataset,
                                batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m,
                                LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    # Load dictionary and generate char_list, sos_id, eos_id.
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}

    # --- model ---
    encoder = Encoder(args.einput * args.LFR_m,
                      args.ehidden,
                      args.elayer,
                      vocab_size,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size,
                      args.dembed,
                      sos_id,
                      eos_id,
                      args.dhidden,
                      args.dlayer,
                      args.offset,
                      args.atype,
                      # NOTE(review): decoder reuses the encoder dropout rate
                      # (edropout) — confirm this is intended.
                      dropout=args.edropout,
                      bidirectional_encoder=args.ebidirectional)
    # A bidirectional encoder doubles the projection width fed into CTC.
    eprojs = args.ehidden * 2 if args.ebidirectional else args.ehidden
    ctc = CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout)

    model = Seq2Seq(encoder, decoder, ctc, args)
    print(model)
    model.cuda()

    # --- optimizer ---
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return

    # --- solver ---
    solver = Solver(data, model, optimizer, args)
    solver.train()
示例#18
0
 def __init__(self):
     """Resolve data paths, build loaders, the Transformer model and its
     warmup optimizer, then restore any previous training state.

     Relies on configuration attributes (batch_size, LFR_m, d_model, ...)
     that the enclosing class defines before __init__ runs.
     """
     here = os.path.dirname(os.path.realpath(__file__))
     self.train_json = os.path.join(here, self.train_json)
     self.valid_json = os.path.join(here, self.valid_json)
     self.dict_txt = os.path.join(here, self.dict_txt)

     # Vocabulary plus sos/eos symbol ids from the dictionary file.
     self.char_list, self.sos_id, self.eos_id = process_dict(self.dict_txt)
     self.vocab_size = len(self.char_list)

     # Datasets pre-batch utterances, so the loaders use batch_size=1.
     self.tr_dataset = AudioDataset(self.train_json, self.batch_size,
                                    self.maxlen_in, self.maxlen_out,
                                    batch_frames=self.batch_frames)
     self.cv_dataset = AudioDataset(self.valid_json, self.batch_size,
                                    self.maxlen_in, self.maxlen_out,
                                    batch_frames=self.batch_frames)
     self.tr_loader = AudioDataLoader(self.tr_dataset, batch_size=1,
                                      num_workers=self.num_workers,
                                      shuffle=self.shuffle,
                                      LFR_m=self.LFR_m, LFR_n=self.LFR_n)
     self.cv_loader = AudioDataLoader(self.cv_dataset, batch_size=1,
                                      num_workers=self.num_workers,
                                      LFR_m=self.LFR_m, LFR_n=self.LFR_n)
     self.data = {'tr_loader': self.tr_loader, 'cv_loader': self.cv_loader}

     # Transformer encoder/decoder pair.
     self.encoder = Encoder(self.d_input * self.LFR_m, self.n_layers_enc,
                            self.n_head, self.d_k, self.d_v,
                            self.d_model, self.d_inner,
                            dropout=self.dropout, pe_maxlen=self.pe_maxlen)
     self.decoder = Decoder(self.sos_id, self.eos_id, self.vocab_size,
                            self.d_word_vec, self.n_layers_dec, self.n_head,
                            self.d_k, self.d_v, self.d_model, self.d_inner,
                            dropout=self.dropout,
                            tgt_emb_prj_weight_sharing=self.tgt_emb_prj_weight_sharing,
                            pe_maxlen=self.pe_maxlen)

     # Per-epoch loss history buffers.
     self.tr_loss = torch.Tensor(self.epochs)
     self.cv_loss = torch.Tensor(self.epochs)
     self.model = Transformer(self.encoder, self.decoder)
     adam = torch.optim.Adam(self.model.parameters(),
                             betas=(0.9, 0.98), eps=1e-09)
     self.optimizer = TransformerOptimizer(adam, self.k, self.d_model,
                                           self.warmup_steps)
     self._reset()
示例#19
0
def main(args):
    """Assemble the data pipeline, Transformer model and warmup optimizer,
    then hand everything to the Solver for training.

    Args:
        args: parsed command-line namespace with dataset paths, loader
            options, Transformer hyper-parameters and optimizer settings.
    """
    # --- data ---
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    # Datasets already group utterances into minibatches, hence batch_size=1.
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                shuffle=args.shuffle,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)

    # Dictionary -> character inventory plus sos/eos ids.
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {"tr_loader": tr_loader, "cv_loader": cv_loader}

    # --- model ---
    encoder = Encoder(args.d_input * args.LFR_m, args.n_layers_enc,
                      args.n_head, args.d_k, args.d_v,
                      args.d_model, args.d_inner,
                      dropout=args.dropout, pe_maxlen=args.pe_maxlen)
    decoder = Decoder(sos_id, eos_id, vocab_size, args.d_word_vec,
                      args.n_layers_dec, args.n_head, args.d_k, args.d_v,
                      args.d_model, args.d_inner,
                      dropout=args.dropout,
                      tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                      pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)

    device = flow.device("cuda")
    model.to(device)

    # --- optimizer: Adam wrapped in a warmup learning-rate schedule ---
    optimizer = TransformerOptimizer(
        flow.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps, args.step_num)

    # --- solver ---
    solver = Solver(data, model, optimizer, device, args)
    solver.train()
示例#20
0
    train_df = train_df.dropna(how='any')
    print(train_df.head())
    # test_df = pd.read_csv('test_df.csv', names=['id', 'sent'])

    save_file = os.path.join('save', 'chars')
    chars = get_chars('chinese', save_file, train_df)
    char_to_token = {c: i for i, c in enumerate(chars)}
    token_to_char = {i: c for c, i in char_to_token.items()}
    sos_token = char_to_token['<sos>']
    eos_token = char_to_token['<eos>']
    pad_token = char_to_token['<pad>']

    train_dataset = SpeechDataset(train_df, dataset_dir, char_to_token)
    train_loader = AudioDataLoader(pad_token,
                                   train_dataset,
                                   batch_size=32,
                                   shuffle=True,
                                   drop_last=True)

    # #test_dataset = SpeechDataset(test_df, dataset_dir)
    # #test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    input_size = 128  # num rows in instagram
    hidden_dim = 64  # 256*2 nodes in each LSTM
    num_layers = 3
    dropout = 0.1
    layer_norm = False
    encoder = Listener(input_size,
                       hidden_dim,
                       num_layers,
                       dropout=dropout,
示例#21
0
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    wer, cer, output_data = evaluate(test_loader=test_loader,
                                     device=device,
                                     model=model,
                                     decoder=decoder,
                                     target_decoder=target_decoder,
                                     save_output=args.save_output,
                                     verbose=args.verbose,
                                     half=args.half)

    # print('Test Summary \t'
    #       'Average WER {wer:.3f}\t'
    #       'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
    if args.save_output is not None:
        np.save(args.save_output, output_data)
示例#22
0
def evaluate(args):
    """Run source separation over a dataset and report SI-SNRi (and SDRi).

    Args:
        args: namespace with model_path, data_dir, batch_size, sample_rate,
            use_cuda and cal_sdr attributes.

    Returns:
        list: per-utterance estimated source counts. Currently always empty —
            the source-number estimation step is disabled in this build.
    """
    total_SISNRi = 0.0
    total_SDRi = 0.0
    total_cnt = 0
    numberEsti = []

    # Load model and switch to inference mode.
    model = ConvTasNet.load_model(args.model_path)
    model.eval()
    if args.use_cuda:
        model.cuda(0)

    # Load data; batch_size=1 yields one padded mixture per iteration.
    dataset = AudioDataset(args.data_dir, args.batch_size,
                           sample_rate=args.sample_rate, segment=2)
    data_loader = AudioDataLoader(dataset, batch_size=1, num_workers=2)

    with torch.no_grad():
        for i, data in enumerate(data_loader):
            print(i)
            padded_mixture, mixture_lengths, padded_source = data
            if args.use_cuda:
                padded_mixture = padded_mixture.cuda(0)
                mixture_lengths = mixture_lengths.cuda(0)
            # Forward: [B, C, T] source estimates plus [B, N, K, E] embeddings.
            estimate_source, s_embed = model(padded_mixture)
            # PIT loss also yields the permutation of estimates that best
            # matches the references; use the reordered estimates below.
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)
            # Strip padding down to per-utterance arrays.
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            estimate_source = remove_pad(reorder_estimate_source,
                                         mixture_lengths)
            # Score each utterance in the batch.
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                print("Utt", total_cnt + 1)
                if args.cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    print("\tSDRi={0:.2f}".format(avg_SDRi))
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
                total_SISNRi += avg_SISNRi
                total_cnt += 1

    # Guard against an empty dataset to avoid ZeroDivisionError.
    if total_cnt > 0:
        if args.cal_sdr:
            print("Average SDR improvement: {0:.2f}".format(
                total_SDRi / total_cnt))
        print("Average SISNR improvement: {0:.2f}".format(
            total_SISNRi / total_cnt))

    return numberEsti