Example #1
import os

import chainer
import h5py
import numpy as np
from chainer import Variable, serializers
from scipy.ndimage import shift

# Project-local helpers (feature, kaldi_data, system_info, the model classes
# and use_single_gpu) are assumed to be imported from the surrounding package.


def infer(args):
    system_info.print_system_info()

    # Prepare model
    in_size = feature.get_input_dim(args.frame_size, args.context_size,
                                    args.input_transform)

    if args.model_type == 'Transformer':
        model = TransformerDiarization(
            in_size,
            n_units=args.hidden_size,
            n_heads=args.transformer_encoder_n_heads,
            n_layers=args.transformer_encoder_n_layers,
            dropout=0,
            alpha=0)
    else:
        raise ValueError('Unknown model type.')

    serializers.load_npz(args.model_file, model)

    if args.gpu >= 0:
        gpuid = use_single_gpu()
        model.to_gpu()

    # Iterate over recordings in the Kaldi-style data directory
    kaldi_obj = kaldi_data.KaldiData(args.data_dir)
    for recid in kaldi_obj.wavs:
        data, rate = kaldi_obj.load_wav(recid)
        # Extract STFT features, apply the input transform, splice context
        # frames, and subsample along the time axis
        Y = feature.stft(data, args.frame_size, args.frame_shift)
        Y = feature.transform(Y, transform_type=args.input_transform)
        Y = feature.splice(Y, context_size=args.context_size)
        Y = Y[::args.subsampling]
        out_chunks = []
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            hs = None  # hidden state passed between consecutive chunks
            for start, end in _gen_chunk_indices(len(Y), args.chunk_size):
                Y_chunked = Variable(Y[start:end])
                if args.gpu >= 0:
                    Y_chunked.to_gpu(gpuid)
                hs, ys = model.estimate_sequential(hs, [Y_chunked])
                if args.gpu >= 0:
                    ys[0].to_cpu()
                out_chunks.append(ys[0].data)
                if args.save_attention_weight == 1:
                    att_fname = f"{recid}_{start}_{end}.att.npy"
                    att_path = os.path.join(args.out_dir, att_fname)
                    model.save_attention_weight(att_path)
        outfname = recid + '.h5'
        outpath = os.path.join(args.out_dir, outfname)
        if hasattr(model, 'label_delay'):
            # Compensate for the model's output label delay by shifting the
            # posteriors back along the time axis
            outdata = shift(np.vstack(out_chunks), (-model.label_delay, 0))
        else:
            outdata = np.vstack(out_chunks)
        # Save the frame-wise speaker activity posteriors as dataset 'T_hat'
        with h5py.File(outpath, 'w') as wf:
            wf.create_dataset('T_hat', data=outdata)
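
The chunked loop above relies on _gen_chunk_indices, which is not shown in this example. A minimal sketch of such a generator, assuming it simply yields fixed-size [start, end) ranges over the subsampled frame axis (an assumption; the repository's helper may differ):

def _gen_chunk_indices(data_len, chunk_size):
    # Hypothetical sketch: walk the frame axis in steps of chunk_size,
    # clipping the final chunk at data_len.
    start = 0
    while start < data_len:
        end = min(start + chunk_size, data_len)
        yield start, end
        start = end
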
Example #2
    def get_example(self, i):
        rec, st, ed = self.chunk_indices[i]
        Y, T = feature.get_labeledSTFT(
            self.data,
            rec,
            st,
            ed,
            self.frame_size,
            self.frame_shift,
            self.n_speakers)
        Y = feature.transform(Y, self.input_transform)
        Y_spliced = feature.splice(Y, self.context_size)
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
        return Y_ss, T_ss
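
The feature.subsample call above mirrors the Y[::args.subsampling] slicing used at inference time in Example #1. A minimal sketch of a subsampling helper consistent with that usage (an assumption about its behavior, not necessarily the repository's implementation):

def subsample(Y, T, subsampling=1):
    # Hypothetical sketch: keep every `subsampling`-th frame of both the
    # spliced features Y and the frame-wise speaker labels T.
    return Y[::subsampling], T[::subsampling]
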
Example #3
    def __getitem__(self, i):
        rec, st, ed = self.chunk_indices[i]
        Y, T = feature.get_labeledSTFT(self.data, rec, st, ed, self.frame_size,
                                       self.frame_shift, self.n_speakers)
        # Y: (frame, num_ceps)
        Y = feature.transform(Y, self.input_transform)
        # Y_spliced: (frame, num_ceps * (context_size * 2 + 1))
        Y_spliced = feature.splice(Y, self.context_size)
        # Y_ss: (frame / subsampling, num_ceps * (context_size * 2 + 1))
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

        Y_ss = torch.from_numpy(Y_ss).float()
        T_ss = torch.from_numpy(T_ss).float()
        return Y_ss, T_ss
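
The chunks returned by __getitem__ can differ in length (the last chunk of a recording may be shorter than the others), so batching them usually requires a padding collate function. A minimal usage sketch, assuming a dataset class named DiarizationDataset and standard torch utilities; the names are illustrative, not the repository's API:

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Pad variable-length (Y_ss, T_ss) chunks along the time axis so they
    # can be stacked into a single batch tensor.
    ys, ts = zip(*batch)
    ys = pad_sequence(ys, batch_first=True, padding_value=0.0)
    ts = pad_sequence(ts, batch_first=True, padding_value=0.0)
    return ys, ts

# loader = DataLoader(DiarizationDataset(...), batch_size=8,
#                     shuffle=True, collate_fn=collate_fn)
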
Example #4
    def get_example(self, i):
        rec, st, ed = self.chunk_indices[i]
        n_speakers, Y, T = feature.get_labeledSTFT(
            self.data, rec, st, ed, self.frame_size, self.frame_shift, None
        )  # pass None instead of self.n_speakers so the actual speaker count is returned
        Y = feature.transform(Y, self.input_transform)
        Y_spliced = feature.splice(Y, self.context_size)
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

        # Here self.n_speakers denotes the maximum number of speakers in the
        # training set. Pad T_ss to that width so that chunks can be
        # concatenated by _convert during training.
        T_ss = np.pad(T_ss, ((0, 0), (0, self.n_speakers - T_ss.shape[1])),
                      'constant',
                      constant_values=0)

        return n_speakers, Y_ss, T_ss

    def get_example(self, i):
        rec, st, ed = self.chunk_indices[i]
        Y, T = feature.get_labeledSTFT(self.data, rec, st, ed, self.frame_size,
                                       self.frame_shift, self.n_speakers)
        Y = feature.transform(Y, self.input_transform)
        Y_spliced = feature.splice(Y, self.context_size)
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

        # If the chunk contains more than self.n_speakers speakers,
        # keep the self.n_speakers speakers with the most speech activity
        if self.n_speakers and T_ss.shape[1] > self.n_speakers:
            selected_speakers = np.argsort(
                T_ss.sum(axis=0))[::-1][:self.n_speakers]
            T_ss = T_ss[:, selected_speakers]

        # If self.shuffle is True, shuffle the frame order along the time axis;
        # this operation improves the performance of EEND-EDA
        if self.shuffle:
            order = np.arange(Y_ss.shape[0])
            np.random.shuffle(order)
            Y_ss = Y_ss[order]
            T_ss = T_ss[order]

        return Y_ss, T_ss
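
The first variant in this example pads T_ss up to a fixed self.n_speakers so that label chunks from different recordings share the same number of speaker columns and can be combined into one mini-batch. A minimal illustration, assuming the batch is a list of (n_speakers, Y_ss, T_ss) triples as returned by that variant; the _convert mentioned in its comment is not shown here, so this is an illustrative stand-in rather than the actual converter:

def to_minibatch(batch):
    # Hypothetical sketch: collect feature and label chunks. Every T_ss has
    # been padded to the same number of speaker columns, so the label arrays
    # are shape-compatible across the batch.
    xs = [Y_ss for _, Y_ss, _ in batch]
    ts = [T_ss for _, _, T_ss in batch]
    ns = [n for n, _, _ in batch]  # true per-chunk speaker counts
    return {'xs': xs, 'ts': ts, 'n_speakers': ns}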