Example #1
    def __call__(self, aco_tensor):
        # aco_tensor: [T, cc_order + 2], where T is the number of frames
        if apt is None:
            raise ValueError('Please install ahoproc_tools to '
                             'process ahocoder data')
        # voiced frequency is the [-2] dim
        fv = aco_tensor[:, -2].contiguous().view(-1, 1)
        fv_interp, uv = apt.interpolation(fv.numpy(), self.fv_k)
        i_fv_t = torch.FloatTensor(fv_interp)
        if self.normalize:
            i_fv_t = i_fv_t / 1000
        # lf0 is the [-1] dim (its uv mask supersedes the fv one)
        lf0 = aco_tensor[:, -1].contiguous().view(-1, 1)
        lf0_interp, uv = apt.interpolation(lf0.numpy(), self.lf0_k)
        if np.any(lf0_interp <= self.lf0_k):
            # totally unvoiced segment: clamp to a minimum F0 of 60 Hz
            lf0_interp = np.log(60) * np.ones(lf0_interp.shape)
            uv = np.zeros(uv.shape)
        i_lf0_t = torch.FloatTensor(lf0_interp)
        uv_t = torch.FloatTensor(np.array(uv, dtype=np.float32))
        # swap the raw fv/lf0 dims for their interpolated versions
        aco_tensor = torch.cat((aco_tensor[:, :-2], i_fv_t, i_lf0_t), dim=1)
        if self.stats is not None:
            aco_tensor = self.normalizer(aco_tensor)
        # append the U/V flag as one extra dim
        aco_tensor = torch.cat((aco_tensor, uv_t), dim=1)
        return aco_tensor
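
The transform above relies on apt.interpolation to fill unvoiced frames before the tensors are recomposed. Below is a rough sketch of the contract this helper is assumed to satisfy (values equal to an unvoiced key are replaced by linear interpolation between voiced neighbours, and a voiced/unvoiced mask is returned alongside); it is not the actual ahoproc_tools implementation.

import numpy as np

def interpolation_sketch(signal, unvoiced_key):
    # hypothetical stand-in for ahoproc_tools' interpolation()
    signal = np.asarray(signal, dtype=np.float64).ravel()
    uv = (signal != unvoiced_key).astype(np.float32)  # 1 = voiced, 0 = unvoiced
    voiced_idx = np.where(uv > 0)[0]
    if len(voiced_idx) == 0:
        # nothing to interpolate from; the caller handles the all-unvoiced case
        return signal, uv
    interp = np.interp(np.arange(len(signal)), voiced_idx, signal[voiced_idx])
    return interp, uv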
Example #2
    def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        wav = wav.data.numpy()
        max_frames = wav.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            proso = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            proso = proso[:, beg_i:end_i]
            pkg[self.name] = proso
        else:
            # first compute logF0 and voiced/unvoiced flag
            # f0 = pysptk.rapt(wav.astype(np.float32),
            #                 fs=self.sr, hopsize=self.hop,
            #                 min=self.f0_min, max=self.f0_max,
            #                 otype='f0')
            f0 = pysptk.swipe(wav.astype(np.float64),
                              fs=self.sr, hopsize=self.hop,
                              min=self.f0_min,
                              max=self.f0_max,
                              otype='f0')
            # sound = pm.Sound(wav.astype(np.float32), self.sr)
            # f0 = sound.to_pitch(self.hop / 16000).selected_array['frequency']
            if len(f0) < max_frames:
                # pad by repeating the tail values up to max_frames
                pad = max_frames - len(f0)
                f0 = np.concatenate((f0, f0[-pad:]), axis=0)
            lf0 = np.log(f0 + 1e-10)
            lf0, uv = interpolation(lf0, -1)
            lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]
            uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
            if torch.sum(uv) == 0:
                # chunk is completely unvoiced: clamp lf0 to log(f0_min)
                lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
            # assert lf0.min() > 0, lf0.data.numpy()
            # secondly obtain zcr
            zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                     frame_length=self.win,
                                                     hop_length=self.hop)
            zcr = torch.tensor(zcr.astype(np.float32))
            zcr = zcr[:, :max_frames]
            # finally obtain energy (librosa renamed rmse -> rms in >= 0.8)
            egy = librosa.feature.rms(y=wav, frame_length=self.win,
                                      hop_length=self.hop,
                                      pad_mode='constant')
            egy = torch.tensor(egy.astype(np.float32))
            egy = egy[:, :max_frames]
            proso = torch.cat((lf0, uv, egy, zcr), dim=0)
  
            if self.der_order > 0:
                # append delta features up to der_order
                deltas = [proso]
                for n in range(1, self.der_order + 1):
                    deltas.append(librosa.feature.delta(proso.numpy(), order=n))
                proso = torch.from_numpy(np.concatenate(deltas))

            pkg[self.name] = proso
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg
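
A hedged usage sketch for the transform above; the class name (Prosody) and its constructor arguments are assumptions inferred from the attributes the method reads, and format_package is assumed to accept a plain dict.

import torch

transform = Prosody(sr=16000, hop=160, win=400, f0_min=60, f0_max=240,
                    name='prosody', der_order=0)
pkg = {'chunk': torch.randn(16000),  # 1 s of audio at 16 kHz
       'chunk_beg_i': 0, 'chunk_end_i': 16000}
pkg = transform(pkg)
# proso stacks [lf0, uv, egy, zcr] row-wise: [4 * (der_order + 1), max_frames]
print(pkg['prosody'].shape)  # here: [4, 100]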
Example #3
def main(opts):
    # accumulate per-file feature arrays for global stats
    X = []
    for afile in tqdm.tqdm(opts.arff_files):
        with open(afile) as af:
            data = arff.load(af)
            attrs = [at[0] for at in data['attributes']]
            f0_idx = attrs.index('F0_sma')
            data = data['data']
            array = []
            for dpoint in data:
                f0_val = dpoint[f0_idx]
                if f0_val > 0:
                    dpoint[f0_idx] = np.log(f0_val)
                else:
                    # unvoiced marker, interpolated below
                    dpoint[f0_idx] = -1e10
                # drop name and timestamp (first two) and the class label (last)
                array.append(dpoint[2:-1])
            array = np.array(array, dtype=np.float32)
            lf0, _ = interpolation(array[:, -1], -1e10)
            array[:, -1] = lf0
            if opts.out_stats is not None:
                X.append(array)
            npfile = os.path.splitext(afile)[0]
            # np.save appends the .npy extension automatically
            np.save(npfile, array.T)
    if opts.out_stats is not None:
        X = np.concatenate(X, axis=0)
        mn = np.mean(X, axis=0)
        sd = np.std(X, axis=0)
        with open(opts.out_stats, 'wb') as out_f:
            pickle.dump({'mean': mn, 'std': sd}, out_f)
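
A companion sketch for consuming the statistics written above: load the pickled mean/std and z-normalize one of the saved feature files. The file names are made up, and recall the arrays were saved transposed, i.e. [feats, T].

import numpy as np
import pickle

with open('stats.pkl', 'rb') as f:
    stats = pickle.load(f)
feats = np.load('utt0001.npy')  # [feats, T], saved as array.T above
# broadcast the per-feature stats over the time axis
norm = (feats - stats['mean'][:, None]) / (stats['std'][:, None] + 1e-8)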
Example #4
    def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        wav = wav.data.numpy()
        max_frames = wav.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            proso = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            proso = proso[:, beg_i:end_i]
            pkg['prosody'] = proso
        else:
            # first compute logF0 and the voiced/unvoiced flag
            f0 = pysptk.swipe(wav.astype(np.float64),
                              fs=self.sr,
                              hopsize=self.hop,
                              min=self.f0_min,
                              max=self.f0_max,
                              otype='f0')
            lf0 = np.log(f0 + 1e-10)
            lf0, uv = interpolation(lf0, -1)
            lf0 = torch.tensor(lf0.astype(
                np.float32)).unsqueeze(0)[:, :max_frames]
            uv = torch.tensor(uv.astype(
                np.float32)).unsqueeze(0)[:, :max_frames]
            if torch.sum(uv) == 0:
                # chunk is completely unvoiced: clamp lf0 to log(f0_min)
                lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
            assert lf0.min() > 0, lf0.data.numpy()
            # secondly obtain the zero-crossing rate
            zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                     frame_length=self.win,
                                                     hop_length=self.hop)
            zcr = torch.tensor(zcr.astype(np.float32))
            zcr = zcr[:, :max_frames]
            # finally obtain energy (librosa renamed rmse -> rms in >= 0.8)
            egy = librosa.feature.rms(y=wav,
                                      frame_length=self.win,
                                      hop_length=self.hop,
                                      pad_mode='constant')
            egy = torch.tensor(egy.astype(np.float32))
            egy = egy[:, :max_frames]
            proso = torch.cat((lf0, uv, egy, zcr), dim=0)
            pkg['prosody'] = proso
        return pkg
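
This is a trimmed variant of Example #2 (no tail padding, no delta features). A sketch of how its cached_file branch could be fed, reusing the hypothetical Prosody transform instance from the sketch after Example #2: compute prosody once over the whole utterance, save it, and let later calls slice it by chunk indices.

import torch

full = {'chunk': torch.randn(10 * 16000),
        'chunk_beg_i': 0, 'chunk_end_i': 10 * 16000}
full = transform(full)
torch.save(full['prosody'], 'utt0001.proso.pt')
# later, a sub-chunk only needs the cached slice [beg_i:end_i] in frames:
sub = {'chunk': torch.randn(16000),
       'chunk_beg_i': 16000, 'chunk_end_i': 32000}
sub = transform(sub, cached_file='utt0001.proso.pt')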
Example #5
    def __call__(self, tensor):
        """

        Args:
            tensor (Tensor): Tensor of audio of size (samples x 1)

        """
        # pysptk and interpolate are a MUST in this transform
        import pysptk
        from ahoproc_tools.interpolate import interpolation
        t_npy = tensor.cpu().squeeze(1).numpy()
        #print('t_npy shape: ', t_npy.shape)
        seqlen = t_npy.shape[0]
        T = seqlen // self.hop_length
        # compute LF0 and UV
        f0 = pysptk.swipe(t_npy.astype(np.float64),
                          fs=self.sr,
                          hopsize=self.hop_length,
                          min=60,
                          max=240,
                          otype="f0")[:T]
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = interpolation(lf0, -1)
        if np.any(lf0 == np.log(1e-10)):
            # no voiced anchors to interpolate from: the whole chunk is
            # unvoiced, so set all lf0 to log(60) (min F0) as a PAD symbol
            lf0 = np.ones(lf0.shape) * np.log(60)
            # and mark every frame as unvoiced
            uv = np.zeros(uv.shape)
        ret = {
            'lf0': torch.FloatTensor(lf0).view(-1, 1),
            'uv': torch.FloatTensor(uv.astype(np.float32)).view(-1, 1)
        }
        tot_frames = T

        # MelSpectrum and MFCCs
        mel = self.mel(tensor).transpose(0, 1).squeeze(2)
        # do compression?
        if self.dynamic_norm_spec:
            mel = torch.log1p(mel * 10000) / torch.log(torch.FloatTensor([10]))
        ret['mel_spec'] = mel[:tot_frames]
        mfcc = librosa.feature.mfcc(y=t_npy,
                                    sr=self.sr,
                                    n_fft=self.n_fft,
                                    hop_length=self.hop_length,
                                    n_mfcc=self.mfcc_order).T
        mfcc = mfcc[:tot_frames]
        ret['mfcc'] = torch.FloatTensor(mfcc)
        # Spectrogram abs magnitude [dB]
        spec = librosa.stft(t_npy,
                            n_fft=self.n_fft,
                            hop_length=self.hop_length,
                            win_length=self.win_length,
                            window=self.window)
        spec_db = librosa.amplitude_to_db(spec).T
        spec_ang = np.angle(spec).T
        spec_db = spec_db[:tot_frames]
        spec_ang = spec_ang[:tot_frames]
        ret['mag'] = torch.FloatTensor(spec_db)
        ret['pha'] = torch.FloatTensor(spec_ang)
        # ZCR and energy (librosa renamed rmse -> rms in >= 0.8)
        egy = librosa.feature.rms(y=t_npy,
                                  frame_length=self.win_length,
                                  hop_length=self.hop_length,
                                  pad_mode='constant').T
        egy = egy[:tot_frames]
        zcr = librosa.feature.zero_crossing_rate(y=t_npy,
                                                 frame_length=self.win_length,
                                                 hop_length=self.hop_length).T
        zcr = zcr[:tot_frames]
        ret['egy'] = torch.FloatTensor(egy)
        ret['zcr'] = torch.FloatTensor(zcr)
        ntensor = tensor.clone()
        # each optional augmentation below is applied with probability 0.5
        if hasattr(self, 'chopper'):
            do_chop = random.random() > 0.5
            if do_chop:
                ntensor = self.chopper(ntensor, self.sr)

        if hasattr(self, 'additive'):
            do_add = random.random() > 0.5
            if do_add:
                ntensor = self.additive(ntensor.numpy(), self.sr)

        if hasattr(self, 'clipping'):
            do_clip = random.random() > 0.5
            if do_clip:
                ntensor = self.clipping(ntensor.numpy())
        ret['wav'] = ntensor.view((-1, 1))
        ret['cwav'] = tensor.view((-1, 1))
        return ret
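
A hedged usage sketch; the class name (WavFeaturizer) and its constructor are assumptions inferred from the attributes the method reads (sr, hop_length, n_fft, win_length, window, mfcc_order, mel, ...).

import torch

featurizer = WavFeaturizer(sr=16000, n_fft=1024, hop_length=256,
                           win_length=1024, window='hann', mfcc_order=13)
wav = torch.randn(16000, 1)  # (samples x 1), as the docstring requires
feats = featurizer(wav)
for k, v in feats.items():
    # e.g. lf0 (62, 1), mfcc (62, 13), mag (62, 513), wav (16000, 1), ...
    print(k, tuple(v.shape))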