Example No. 1
    def get_model_input(cls, task, audio: Union[str, torch.Tensor]):
        input_type = task.data_cfg.hub.get("input_type", "fbank80")
        if input_type == "fbank80_w_utt_cmvn":
            if isinstance(audio, str):
                feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio))
                feat = feat.unsqueeze(0)  # T x D -> 1 x T x D
            else:
                feat = kaldi.fbank(audio, num_mel_bins=80).numpy()  # 1 x T x D
        elif input_type in {"waveform", "standardized_waveform"}:
            if isinstance(audio, str):
                feat, sr = get_wav(audio)  # C x T
                feat, _ = convert_wav(feat,
                                      sr,
                                      to_sample_rate=16_000,
                                      to_mono=True)  # C x T -> 1 x T
            else:
                feat = audio.numpy()
        else:
            raise ValueError(f"Unknown value: input_type = {input_type}")

        src_lengths = torch.Tensor([feat.shape[1]]).long()
        src_tokens = torch.from_numpy(feat)  # 1 x T (x D)
        if input_type == "standardized_waveform":
            with torch.no_grad():
                src_tokens = F.layer_norm(src_tokens, src_tokens.shape)

        return {
            "net_input": {
                "src_tokens": src_tokens,
                "src_lengths": src_lengths,
                "prev_output_tokens": None,
            },
            "target_lengths": None,
            "speaker": None,
        }
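
For orientation, a minimal shape check of the fbank branch above, assuming a synthetic one-second 16 kHz waveform (get_fbank, utt_cmvn and the task config are not reproduced here):

import torch
import torchaudio.compliance.kaldi as kaldi

waveform = torch.randn(1, 16_000)              # 1 x T: one second of synthetic 16 kHz audio
feat = kaldi.fbank(waveform, num_mel_bins=80)  # frames x 80 log-mel filterbank matrix
print(feat.shape)                              # roughly (98, 80) with the default 25 ms / 10 ms framing
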
Example No. 2
    def __getitem__(self, index):
        # index=None
        import torchaudio
        import torchaudio.compliance.kaldi as kaldi
        # from . import kaldi as kaldi

        tgt_item = self.tgt[index] if self.tgt is not None else None
        print(index)
        path = self.aud_paths[index]
        if not os.path.exists(path):
            raise FileNotFoundError("Audio file not found: {}".format(path))
        sound, sample_rate = torchaudio.load_wav(path)
        output = kaldi.fbank(sound,
                             num_mel_bins=self.num_mel_bins,
                             frame_length=self.frame_length,
                             frame_shift=self.frame_shift)

        output_cmvn = data_utils.apply_mv_norm(output)
        self.s2s_collater = Seq2SeqCollater(0,
                                            1,
                                            pad_index=self.tgt_dict.pad(),
                                            eos_index=self.tgt_dict.eos(),
                                            move_eos_to_beginning=True)

        return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
Example No. 3
    def __getitem__(self, index):
        import torchaudio
        import torchaudio.compliance.kaldi as kaldi
        tgt_item = self.tgt[index] if self.tgt is not None else None

        path = self.aud_paths[index]
        if not os.path.exists(path):
            raise FileNotFoundError("Audio file not found: {}".format(path))
        
        vid_data = self.load_video(index)
        sound, sample_rate = torchaudio.load_wav(path)
        
        if self.video_offset > 0: # positive offset - audio and video
            padding_frame = np.zeros([self.video_offset, np.shape(vid_data)[1]], dtype='float32')
            vid_data = np.concatenate((padding_frame,vid_data),axis=0)
        elif self.video_offset < 0: # negative offset - video and audio
            padding_frame = np.zeros([abs(self.video_offset), np.shape(vid_data)[1]], dtype='float32')
            vid_data = np.concatenate((vid_data, padding_frame),axis=0)
            aud_padding_size = int(abs(self.video_offset) * 40 * sample_rate * 0.001)
            aud_padding = torch.zeros_like(sound)[:,0:aud_padding_size]
            sound = torch.cat((aud_padding, sound), 1)

        output = kaldi.fbank(
            sound,
            num_mel_bins=self.num_mel_bins,
            frame_length=self.frame_length,
            frame_shift=self.frame_shift
        )
        output_cmvn = data_utils.apply_mv_norm(output)

        return {"id": index, "audio_data": [output_cmvn.detach(), tgt_item], "video_data": [vid_data, tgt_item]}
Example No. 4
 def __call__(self, batch):
     mean_stat = torch.zeros(self.feat_dim)
     var_stat = torch.zeros(self.feat_dim)
     number = 0
     for item in batch:
         value = item[1].strip().split(",")
         assert len(value) == 3 or len(value) == 1
         wav_path = value[0]
         sample_rate = torchaudio.backend.sox_backend.info(wav_path)[0].rate
         # len(value) == 3 means a segmented wav.scp,
         # len(value) == 1 means the original wav.scp
         if len(value) == 3:
             start_frame = int(float(value[1]) * sample_rate)
             end_frame = int(float(value[2]) * sample_rate)
             waveform, sample_rate = torchaudio.backend.sox_backend.load(
                 filepath=wav_path,
                 num_frames=end_frame - start_frame,
                 offset=start_frame)
             waveform = waveform * (1 << 15)
         else:
             waveform, sample_rate = torchaudio.load_wav(item[1])
         mat = kaldi.fbank(waveform,
                           num_mel_bins=self.feat_dim,
                           dither=0.0,
                           energy_floor=0.0)
         mean_stat += torch.sum(mat, axis=0)
         var_stat += torch.sum(torch.square(mat), axis=0)
         number += mat.shape[0]
     return number, mean_stat, var_stat
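
The accumulated (number, mean_stat, var_stat) triple is normally reduced to a global mean and inverse standard deviation afterwards; a sketch of that final step (finalize_cmvn is a hypothetical helper, not part of the example):

import torch

def finalize_cmvn(number, mean_stat, var_stat, eps=1e-8):
    # Turn summed statistics into per-dimension mean and inverse std for CMVN.
    mean = mean_stat / number
    var = var_stat / number - mean ** 2
    istd = 1.0 / torch.sqrt(var.clamp(min=eps))
    return mean, istd
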
Example No. 5
def get_torchaudio_fbank_or_mfcc(
    waveform: np.ndarray,
    sample_rate: float,
    n_bins: int = 80,
    feature_type: str = "fbank",
) -> np.ndarray:
    """Get mel-filter bank or mfcc features via TorchAudio."""
    try:
        import torchaudio.compliance.kaldi as ta_kaldi

        waveform = torch.from_numpy(waveform)
        if feature_type == "fbank":
            features = ta_kaldi.fbank(
                waveform, num_mel_bins=n_bins, sample_frequency=sample_rate
            )
        else:
            features = ta_kaldi.mfcc(
                waveform,
                num_mel_bins=n_bins,
                num_ceps=40,
                low_freq=20,
                high_freq=-400,
                sample_frequency=sample_rate,
            )
        return features.numpy()
    except ImportError:
        raise ImportError(
            "Please install torchaudio to enable online feature extraction: pip install torchaudio"
        )
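
A possible call, assuming a synthetic one-second 16 kHz signal shaped 1 x T as ta_kaldi.fbank expects:

import numpy as np

audio = np.random.randn(1, 16_000).astype(np.float32)
feats = get_torchaudio_fbank_or_mfcc(audio, sample_rate=16_000, n_bins=80, feature_type="fbank")
print(feats.shape)  # roughly (98, 80)
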
Example No. 6
def compute_fbank(data,
                  num_mel_bins=23,
                  frame_length=25,
                  frame_shift=10,
                  dither=0.0):
    """ Extract fbank

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'wav' in sample
        assert 'key' in sample
        assert 'label' in sample
        sample_rate = sample['sample_rate']
        waveform = sample['wav']
        waveform = waveform * (1 << 15)
        # Only keep key, feat, label
        mat = kaldi.fbank(waveform,
                          num_mel_bins=num_mel_bins,
                          frame_length=frame_length,
                          frame_shift=frame_shift,
                          dither=dither,
                          energy_floor=0.0,
                          sample_frequency=sample_rate)
        yield dict(key=sample['key'], label=sample['label'], feat=mat)
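
A usage sketch with a single synthetic sample in the Iterable[dict] format described by the docstring:

import torch

data = [{
    "key": "utt1",
    "wav": torch.randn(1, 16_000),  # 1 x T float waveform, assumed already loaded
    "label": [1, 2, 3],
    "sample_rate": 16_000,
}]
for out in compute_fbank(data, num_mel_bins=23):
    print(out["key"], out["feat"].shape)  # roughly (98, 23)
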
Example No. 7
    def __call__(self, new_samples):
        samples = self.previous_residual_samples + new_samples
        if len(samples) < self.num_samples_per_window:
            self.previous_residual_samples = samples
            return

        # num_frames is the number of frames from the new segment
        num_frames = math.floor(
            (len(samples) -
             self.len_ms_to_samples(self.window_size - self.shift_size)) /
            self.num_samples_per_shift)

        # the number of frames used for feature extraction
        # including some part of the previous segment
        effective_num_samples = int(
            num_frames * self.len_ms_to_samples(self.shift_size) +
            self.len_ms_to_samples(self.window_size - self.shift_size))

        input_samples = samples[:effective_num_samples]
        self.previous_residual_samples = samples[num_frames *
                                                 self.num_samples_per_shift:]

        torch.manual_seed(1)
        output = kaldi.fbank(
            torch.FloatTensor(input_samples).unsqueeze(0),
            num_mel_bins=self.feature_dim,
            frame_length=self.window_size,
            frame_shift=self.shift_size,
        ).numpy()

        output = self.transform(output)

        return torch.from_numpy(output)
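
To make the frame bookkeeping above concrete, a worked example assuming 16 kHz audio with a 25 ms window and 10 ms shift (400 samples per window, 160 per shift):

import math

sample_rate = 16_000
num_samples_per_window = 25 * sample_rate // 1000          # 400
num_samples_per_shift = 10 * sample_rate // 1000           # 160
overlap = num_samples_per_window - num_samples_per_shift   # 240

samples = 2_000  # residual + new samples in this call
num_frames = math.floor((samples - overlap) / num_samples_per_shift)  # (2000 - 240) / 160 -> 11
effective_num_samples = num_frames * num_samples_per_shift + overlap  # 11 * 160 + 240 = 2000
residual = samples - num_frames * num_samples_per_shift               # 240 samples carried over
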
Example No. 8
 def _fbank_features(self, sound):
     output = kaldi.fbank(sound,
                          num_mel_bins=self.num_mel_bins,
                          frame_length=self.frame_length,
                          frame_shift=self.frame_shift)
     output_cmvn = data_utils.apply_mv_norm(output).detach()
     return output_cmvn
 def get_output_fn(sound, args):
     output = kaldi.fbank(sound,
                          blackman_coeff=args[1],
                          dither=0.0,
                          energy_floor=args[2],
                          frame_length=args[3],
                          frame_shift=args[4],
                          high_freq=args[5],
                          htk_compat=args[6],
                          low_freq=args[7],
                          num_mel_bins=args[8],
                          preemphasis_coefficient=args[9],
                          raw_energy=args[10],
                          remove_dc_offset=args[11],
                          round_to_power_of_two=args[12],
                          snip_edges=args[13],
                          subtract_mean=args[14],
                          use_energy=args[15],
                          use_log_fbank=args[16],
                          use_power=args[17],
                          vtln_high=args[18],
                          vtln_low=args[19],
                          vtln_warp=args[20],
                          window_type=args[21])
     return output
Example No. 10
 def encode_signal(self, signal):
     if isinstance(signal, np.ndarray):
         signal = torch.from_numpy(signal)
     encoded = kaldi.fbank(signal,
                           num_mel_bins=self.num_mel_bins,
                           frame_length=self.frame_length,
                           frame_shift=self.frame_shift)
     encoded = apply_mv_norm(encoded)
     return encoded
Example No. 11
 def get_feats(self, file_path):
     wav, sr = sf.read(file_path)
     feats = torch.from_numpy(wav).float()
     feats = kaldi.fbank(
         feats.unsqueeze(0),
         num_mel_bins=self.num_mel_bins,
         frame_length=self.frame_length,
         sample_frequency=sr,
     )
     return feats
Example No. 12
 def _extract_fbank_features(
     self,
     waveform: np.ndarray,
 ) -> np.ndarray:
     """
     Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
     and hence the waveform should not be normalized before feature extraction.
     """
     waveform = waveform * (2 ** 15)  # Kaldi compliance: 16-bit signed integers
     waveform = torch.from_numpy(waveform).unsqueeze(0)
     features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
     return features.numpy()
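
The 2 ** 15 scaling matters because Kaldi-style fbank assumes sample values in the 16-bit integer range; with dither at torchaudio's default of 0.0, scaling the amplitude by k only shifts the log-mel features by a constant 2 * ln(k). A small check under that assumption:

import torch
import torchaudio.compliance.kaldi as ta_kaldi

wave = 0.1 * torch.randn(1, 16_000)
f_scaled = ta_kaldi.fbank(wave * (2 ** 15), num_mel_bins=80, sample_frequency=16_000)
f_raw = ta_kaldi.fbank(wave, num_mel_bins=80, sample_frequency=16_000)
print((f_scaled - f_raw).mean())  # close to 2 * ln(2 ** 15), i.e. about 20.8
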
Example No. 13
def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]:
    """Get mel-filter bank features via TorchAudio."""
    try:
        import torch
        import torchaudio.compliance.kaldi as ta_kaldi

        waveform = torch.from_numpy(waveform).unsqueeze(0)
        features = ta_kaldi.fbank(
            waveform, num_mel_bins=n_bins, sample_frequency=sample_rate
        )
        return features.numpy()
    except ImportError:
        return None
Example No. 14
    def set_feats_func(self):

        # initialize feats_function
        if self.configs["feats"]["type"] == "mfcc_kaldi":
            from torchaudio.compliance.kaldi import mfcc
            self.feats_func = lambda x: mfcc(torch.from_numpy(x.astype("float32").reshape(1, -1)), **self.configs["mfcc_kaldi"]).transpose(0, 1)
        elif self.configs["feats"]["type"] == "fbank_kaldi":
            from torchaudio.compliance.kaldi import fbank
            self.feats_func = lambda x: fbank(torch.from_numpy(x.astype("float32").reshape(1, -1)), **self.configs["fbank_kaldi"]).transpose(0, 1)
        elif self.configs["feats"]["type"] == "spectrogram_kaldi":
            from torchaudio.compliance.kaldi import spectrogram
            self.feats_func = lambda x: spectrogram(torch.from_numpy(x.astype("float32").reshape(1, -1)),
                                             **self.configs["spectrogram_kaldi"]).transpose(0, 1)
        else:
            raise NotImplementedError
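
A hypothetical configs fragment for the fbank_kaldi branch; the inner dict is passed straight to torchaudio.compliance.kaldi.fbank as keyword arguments, and the final transpose turns the T x D output into D x T:

configs = {
    "feats": {"type": "fbank_kaldi"},
    "fbank_kaldi": {"num_mel_bins": 40, "frame_length": 25.0, "frame_shift": 10.0},
}
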
Example No. 15
 def __call__(self, batch):
     mean_stat = torch.zeros(self.feat_dim)
     var_stat = torch.zeros(self.feat_dim)
     number = 0
     for item in batch:
         key = item[0]
         waveform, sample_rate = torchaudio.load_wav(item[1])
         mat = kaldi.fbank(waveform,
                           num_mel_bins=self.feat_dim,
                           dither=0.0,
                           energy_floor=0.0)
         mean_stat += torch.sum(mat, axis=0)
         var_stat += torch.sum(torch.square(mat), axis=0)
         number += mat.shape[0]
     return number, mean_stat, var_stat
Example No. 16
    def __getitem__(self, index):
        import torchaudio
        import torchaudio.compliance.kaldi as kaldi
        tgt_item = self.tgt[index] if self.tgt is not None else None

        path = self.aud_paths[index]
        if not os.path.exists(path):
            raise FileNotFoundError("Audio file not found: {}".format(path))
        sound, sample_rate = torchaudio.load_wav(path)
        output = kaldi.fbank(sound,
                             num_mel_bins=self.num_mel_bins,
                             frame_length=self.frame_length,
                             frame_shift=self.frame_shift)
        output_cmvn = data_utils.apply_mv_norm(output)

        return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
Example No. 17
    def __call__(self, batch):
        mean_stat = torch.zeros(self.feat_dim)
        var_stat = torch.zeros(self.feat_dim)
        number = 0
        for item in batch:
            try:
                value = item[1].strip().split(",")
                assert len(value) == 3 or len(value) == 1
                wav_path = value[0]
                sample_rate = torchaudio.backend.sox_io_backend.info(
                    wav_path).sample_rate
                resample_rate = sample_rate
                # len(value) == 3 means segmented wav.scp,
                # len(value) == 1 means original wav.scp
                if len(value) == 3:
                    start_frame = int(float(value[1]) * sample_rate)
                    end_frame = int(float(value[2]) * sample_rate)
                    waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                        filepath=wav_path,
                        num_frames=end_frame - start_frame,
                        frame_offset=start_frame)
                else:
                    waveform, sample_rate = torchaudio.load(item[1])

                waveform = waveform * (1 << 15)
                if self.resample_rate != 0 and self.resample_rate != sample_rate:
                    resample_rate = self.resample_rate
                    waveform = torchaudio.transforms.Resample(
                        orig_freq=sample_rate,
                        new_freq=resample_rate)(waveform)

                mat = kaldi.fbank(waveform,
                                  num_mel_bins=self.feat_dim,
                                  dither=0.0,
                                  energy_floor=0.0,
                                  sample_frequency=resample_rate)
                mean_stat += torch.sum(mat, axis=0)
                var_stat += torch.sum(torch.square(mat), axis=0)
                number += mat.shape[0]
            except Exception as e:
                print('read utterance {} error: {}'.format(item[0], e),
                      file=sys.stdout)
        return number, mean_stat, var_stat
Example No. 18
def wav_feat(wav_file, feature_extraction_conf):
    # for torchaudio<0.8
    if hasattr(wav_file, 'read'):
        waveform, sample_rate = wavform_filelike(wav_file)
    elif isinstance(wav_file, str):
        waveform, sample_rate = torchaudio.load_wav(wav_file)
    elif isinstance(wav_file, bytes):
        waveform, sample_rate = wavfrom_bytes(wav_file)
    else:
        raise ValueError('unsupported wav_file type: {}'.format(type(wav_file)))

    wav_dither = 1.0

    mat = kaldi.fbank(
        waveform,
        num_mel_bins=feature_extraction_conf['mel_bins'],
        frame_length=feature_extraction_conf['frame_length'],
        frame_shift=feature_extraction_conf['frame_shift'],
        dither=wav_dither,
        energy_floor=0.0,
        sample_frequency=sample_rate
    )
    feat = mat
    length = mat.shape[0]
    return feat.unsqueeze(0), torch.tensor([length])
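
A possible call (for torchaudio < 0.8, as noted above), with a placeholder path and a configuration dict matching the keys read by wav_feat:

feature_extraction_conf = {"mel_bins": 80, "frame_length": 25, "frame_shift": 10}
feat, length = wav_feat("example.wav", feature_extraction_conf)
print(feat.shape, length)  # feat is 1 x frames x 80
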
Example No. 19
def _get_torchaudio_fbank(waveform,
                          sample_rate,
                          n_bins=80) -> Optional[np.ndarray]:
    """Get mel-filter bank features via TorchAudio."""
    try:
        import torch
        import torchaudio.compliance.kaldi as ta_kaldi
        import torchaudio.sox_effects as ta_sox

        waveform = torch.from_numpy(waveform)
        if len(waveform.shape) == 1:
            # Mono channel: D -> 1 x D
            waveform = waveform.unsqueeze(0)
        else:
            # Merge multiple channels to one: C x D -> 1 x D
            waveform, _ = ta_sox.apply_effects_tensor(waveform, sample_rate,
                                                      [['channels', '1']])

        features = ta_kaldi.fbank(waveform,
                                  num_mel_bins=n_bins,
                                  sample_frequency=sample_rate)
        return features.numpy()
    except ImportError:
        return None
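
A possible call with a synthetic stereo signal; the sox "channels" effect downmixes 2 x T to 1 x T before feature extraction, and None is returned when torchaudio is unavailable:

import numpy as np

stereo = np.random.randn(2, 16_000).astype(np.float32)
feats = _get_torchaudio_fbank(stereo, 16_000, n_bins=80)
print(None if feats is None else feats.shape)  # roughly (98, 80)
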
Example No. 20
        predictions = {}
        for emo_representation in self.emo_representations:
            seq_len = self.args[emo_representation]['args'].seq_length
            mean = self.args[emo_representation]['norm_mean']
            std = self.args[emo_representation]['norm_std']
            # feature normalization and padding have to be done for each
            # emotion representation individually because the means,
            # standard deviations, and sequence lengths can differ
            features = normalize_and_pad_features(fbank, seq_len, torch.from_numpy(mean), torch.from_numpy(std))
            predictions[emo_representation] = softmax(
                self.models[emo_representation](features.unsqueeze(1)), dim=1
            ).detach().numpy()

        arousal_level = self.arousal_mapping[np.argmax(predictions['arousal'])]
        valence_level = self.valence_mapping[np.argmax(predictions['valence'])]
        category_label = self.category_mapping[np.argmax(predictions['category'])]

        return {'emotion': {'arousal': arousal_level,
                            'valence': valence_level,
                            'category': category_label,
                            'category_probabilities':
                                np.around(predictions['category'], 2).reshape(-1)}}


if __name__ == '__main__':
    recognition = EmotionRecognition()
    waveform, sample_rate = torchaudio.load_wav('test.wav')
    f = fbank(waveform, sample_frequency=sample_rate)
    emo = recognition.predict_from_audio(f)
    print(emo)
Example No. 21
    def _feature_fn(self, *args, **kwargs):
        from torchaudio.compliance.kaldi import fbank

        return fbank(*args, **kwargs)
Example No. 22
    def forward(self, input, sample_rate):
        feat = kaldi.fbank(
            input, channel=-1, sample_frequency=sample_rate, num_mel_bins=self.n_mels
        )

        return feat
Example No. 23
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    # print(args)
    print()
    print("*******************")
    print(args.task)
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    def collate_frames(frames):
        """Convert a list of 2d frames into a padded 3d tensor
        Args:
            frames (list): list of 2d frames of size L[i]*f_dim. Where L[i] is
                length of i-th frame and f_dim is static dimension of features
        Returns:
            3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
        """
        len_max = max(frame.size(0) for frame in frames)
        f_dim = frames[0].size(1)
        res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)
        for i, v in enumerate(frames):
            res[i, :v.size(0)] = v
        return res

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])

    # if args.buffer_size > 1:
    #     print('| Sentence buffer size:', args.buffer_size)
    # print('| Type the input sentence and press return:')
    start_id = 0
    audio_root_path = "/search/odin/haiyang/fairseq_exp/e2e_trans/fairseq/examples/speech_recognition/datasets/zh_asr_data/train/train2"
    with open(args.input) as inp:
        input = inp.readline().strip()
        while input:
            print()
            audio_path = audio_root_path + "/" + input.split(" ")[0] + ".wav"
            inputs = [" ".join(input.split(" ")[1:])]
            results = []
            for batch in make_batches(inputs, args, task, max_positions,
                                      encode_fn):
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                if use_cuda:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()
                if args.task == "translation":
                    sample = {
                        'net_input': {
                            'src_tokens': src_tokens,
                            'src_lengths': src_lengths,
                        },
                    }
                else:
                    if not os.path.exists(audio_path):
                        raise FileNotFoundError(
                            "Audio file not found: {}".format(audio_path))
                    sound, sample_rate = torchaudio.load_wav(audio_path)
                    num_mel_bins, frame_length, frame_shift = 80, 25.0, 10.0

                    output = kaldi.fbank(sound,
                                         num_mel_bins=num_mel_bins,
                                         frame_length=frame_length,
                                         frame_shift=frame_shift,
                                         dither=0.0,
                                         energy_floor=1.0)

                    frames = data_utils.apply_mv_norm(output).detach()[
                        None, :, :].type(torch.cuda.FloatTensor)
                    # print(output_cmvn)
                    # frames = collate_frames(output_cmvn)
                    # sort samples by descending number of frames
                    # frames_lengths = torch.cuda.LongTensor(frames.size()[1])
                    frames_lengths = torch.LongTensor(
                        [s.size(0) for s in frames])

                    sample = {
                        'net_input': {
                            'src_tokens': src_tokens,
                            'src_lengths': src_lengths,
                            "audio": frames,
                            "audio_lengths": frames_lengths
                        },
                    }
                translations = task.inference_step(generator, models, sample)
                for i, (id, hypos) in enumerate(
                        zip(batch.ids.tolist(), translations)):
                    src_tokens_i = utils.strip_pad(src_tokens[i],
                                                   tgt_dict.pad())
                    results.append((start_id + id, src_tokens_i, hypos))

            # sort output to match input order
            for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]):
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                    print('S-{}\t{}'.format(id, src_str))

                # Process top predictions
                for hypo in hypos[:min(len(hypos), args.nbest)]:
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    hypo_str = decode_fn(hypo_str)
                    print('H-{}\t{}'.format(id, hypo_str))

                    # print('H-{}\t{}\t{}'.format(id, hypo['score'], hypo_str))
                    # print('P-{}\t{}'.format(
                    #     id,
                    #     ' '.join(map(lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist()))
                    # ))
                    if args.print_alignment:
                        alignment_str = " ".join([
                            "{}-{}".format(src, tgt) for src, tgt in alignment
                        ])
                        print('A-{}\t{}'.format(id, alignment_str))
            input = inp.readline().strip()

        # update running id counter
        start_id += len(inputs)
Example No. 24
        audio_list = load_wav_segments(args.wav_scp, args.segments)

    count = 0
    with open(args.out_ark, 'wb') as ark_fout, \
         open(args.out_scp, 'w', encoding='utf8') as scp_fout:
        for item in audio_list:
            if len(item) == 2:
                key, wav_path = item
                waveform, sample_rate = torchaudio.load_wav(wav_path)
            else:
                assert len(item) == 4
                key, wav_path, start, end = item
                sample_rate = torchaudio.info(wav_path).sample_rate
                frame_offset = int(start * sample_rate)
                num_frames = int((end - start) * sample_rate)
                waveform, sample_rate = torchaudio.load_wav(
                    wav_path, frame_offset, num_frames)

            mat = kaldi.fbank(waveform,
                              num_mel_bins=args.num_mel_bins,
                              frame_length=args.frame_length,
                              frame_shift=args.frame_shift,
                              dither=args.dither,
                              energy_floor=0.0,
                              sample_frequency=sample_rate)
            mat = mat.detach().numpy()
            kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
            count += 1
            if count % 10000 == 0:
                logging.info('Progress {}/{}'.format(count, len(audio_list)))
Example No. 25
def _extract_feature(batch, speed_perturb, wav_distortion_conf,
                     feature_extraction_conf):
    """ Extract acoustic fbank feature from origin waveform.

    Speed perturbation and wave amplitude distortion is optional.

    Args:
        batch: a list of tuple (wav id , wave path).
        speed_perturb: bool, whether or not to use speed pertubation.
        wav_distortion_conf: a dict , the config of wave amplitude distortion.
        feature_extraction_conf:a dict , the config of fbank extraction.

    Returns:
        (keys, feats, labels)
    """
    keys = []
    feats = []
    lengths = []
    wav_dither = wav_distortion_conf['wav_dither']
    wav_distortion_rate = wav_distortion_conf['wav_distortion_rate']
    distortion_methods_conf = wav_distortion_conf['distortion_methods']
    if speed_perturb:
        speeds = [1.0, 1.1, 0.9]
        weights = [1, 1, 1]
        speed = random.choices(speeds, weights, k=1)[0]
        # speed = random.choice(speeds)
    for i, x in enumerate(batch):
        try:
            wav = x[1]
            value = wav.strip().split(",")
            # 1 for general wav.scp, 3 for segmented wav.scp
            assert len(value) == 1 or len(value) == 3
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(
                wav_path).sample_rate
            if 'resample' in feature_extraction_conf:
                resample_rate = feature_extraction_conf['resample']
            else:
                resample_rate = sample_rate
            if speed_perturb:
                if len(value) == 3:
                    logging.error(
                        "speed perturb does not support segmented wav.scp now")
                assert len(value) == 1
                waveform, sample_rate = _load_wav_with_speed(wav_path, speed)
            else:
                # a value length of 3 means a segmented wav.scp that
                # includes the .wav path, start time, and end time
                if len(value) == 3:
                    start_frame = int(float(value[1]) * sample_rate)
                    end_frame = int(float(value[2]) * sample_rate)
                    waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                        filepath=wav_path,
                        num_frames=end_frame - start_frame,
                        frame_offset=start_frame)
                else:
                    waveform, sample_rate = torchaudio.load(wav_path)
            waveform = waveform * (1 << 15)
            if resample_rate != sample_rate:
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)

            if wav_distortion_rate > 0.0:
                r = random.uniform(0, 1)
                if r < wav_distortion_rate:
                    waveform = waveform.detach().numpy()
                    waveform = _waveform_distortion(waveform,
                                                    distortion_methods_conf)
                    waveform = torch.from_numpy(waveform)
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=feature_extraction_conf['mel_bins'],
                frame_length=feature_extraction_conf['frame_length'],
                frame_shift=feature_extraction_conf['frame_shift'],
                dither=wav_dither,
                energy_floor=0.0,
                sample_frequency=resample_rate)
            mat = mat.detach().numpy()
            feats.append(mat)
            keys.append(x[0])
            lengths.append(mat.shape[0])
        except Exception as e:
            print(e)
            logging.warning('read utterance {} error'.format(x[0]))
    # Sort it because sorting is required in pack/pad operation
    order = np.argsort(lengths)[::-1]
    sorted_keys = [keys[i] for i in order]
    sorted_feats = [feats[i] for i in order]
    labels = [x[2].split() for x in batch]
    labels = [np.fromiter(map(int, x), dtype=np.int32) for x in labels]
    sorted_labels = [labels[i] for i in order]
    return sorted_keys, sorted_feats, sorted_labels
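
_load_wav_with_speed is not shown in this example. A minimal stand-in, assuming it applies sox "speed" and "rate" effects through torchaudio (the repository's helper may differ):

import torchaudio

def _load_wav_with_speed_sketch(wav_path, speed, target_rate=16_000):
    # speed == 1.0: plain load; otherwise apply the speed effect and resample back to target_rate
    if speed == 1.0:
        return torchaudio.load(wav_path)
    effects = [["speed", str(speed)], ["rate", str(target_rate)]]
    return torchaudio.sox_effects.apply_effects_file(wav_path, effects)
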
Example No. 26
    def __getitem__(self, index):
        tgt_item = self.tgt[index] if self.tgt is not None else None
        src_item = self.src[index]
        # Append EOS to end of tgt sentence if it does not have an EOS and remove
        # EOS from end of src sentence if it exists. This is useful when we use
        # use existing datasets for opposite directions i.e., when we want to
        # use tgt_dataset as src_dataset and vice versa
        if self.append_eos_to_target:
            eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos()
            if self.tgt and self.tgt[index][-1] != eos:
                tgt_item = torch.cat(
                    [self.tgt[index], torch.LongTensor([eos])])

        if self.append_bos:
            bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos()
            if self.tgt and self.tgt[index][0] != bos:
                tgt_item = torch.cat(
                    [torch.LongTensor([bos]), self.tgt[index]])
            bos = self.src_dict.bos()
            if self.src[index][-1] != bos:
                src_item = torch.cat(
                    [torch.LongTensor([bos]), self.src[index]])
        if self.remove_eos_from_source:
            eos = self.src_dict.eos()
            if self.src[index][-1] == eos:
                src_item = self.src[index][:-1]
        # print(self.audio[])
        path = self.audio[str(index)]['input']['path']
        if not os.path.exists(path):
            raise FileNotFoundError("Audio file not found: {}".format(path))

        # print(path)
        # print(index)
        # exit()
        sound, sample_rate = torchaudio.load_wav(path)

        # print(self.num_mel_bins)
        # print(self.frame_length)
        # print(self.frame_shift)
        # print("&&&&&&&&&&&&&&&&&")
        # exit()
        # if "20170001P00053I0108" in path:
        #     pp=True
        # else:
        #     pp=False
        output = kaldi.fbank(sound,
                             num_mel_bins=self.num_mel_bins,
                             frame_length=self.frame_length,
                             frame_shift=self.frame_shift,
                             dither=0.0,
                             energy_floor=1.0)
        output_cmvn = data_utils.apply_mv_norm(output)
        # if "20170001P00053I0108" in path:
        #     print(path)
        #     print(sound)
        #     print(sample_rate)
        #     print("*******")
        #     print("output")
        #     print(output)
        #     print("output_cmvn")
        #     print(output_cmvn)
        # self.s2s_collater = Seq2SeqCollater(
        #     0, 1, pad_index=self.tgt_dict.pad(),
        #     eos_index=self.tgt_dict.eos(), move_eos_to_beginning=True
        # )

        # return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
        example = {
            'id': index,
            'audio': output_cmvn.detach(),
            'source': src_item,
            'target': tgt_item,
        }
        if self.align_dataset is not None:
            example['alignment'] = self.align_dataset[index]
        return example
Example No. 27
def _extract_feature(batch, speed_perturb, wav_distortion_conf,
                     feature_extraction_conf, speech_aug, acousticsimulator,
                     acoustic_simulator_conf):
    """ Extract acoustic fbank feature from origin waveform.

    Speed perturbation and wave amplitude distortion is optional.

    Args:
        batch: a list of tuple (wav id , wave path).
        speed_perturb: bool, whether or not to use speed pertubation.
        wav_distortion_conf: a dict , the config of wave amplitude distortion.
        feature_extraction_conf:a dict , the config of fbank extraction.

    Returns:
        (keys, feats, labels)
    """
    keys = []
    feats = []
    lengths = []
    wav_dither = wav_distortion_conf['wav_dither']
    wav_distortion_rate = wav_distortion_conf['wav_distortion_rate']
    distortion_methods_conf = wav_distortion_conf['distortion_methods']

    if acoustic_simulator_conf is not None:
        acoustic_simulator_samplerate = acoustic_simulator_conf['samplerate']

    if speed_perturb:
        speeds = [1.0, 1.1, 0.9]
        weights = [1, 1, 1]
        speed = random.choices(speeds, weights, k=1)[0]
        # speed = random.choice(speeds)
    for i, x in enumerate(batch):
        try:
            wav = x[1]
            value = wav.strip().split(",")
            # 1 for general wav.scp, 3 for segmented wav.scp
            assert len(value) == 1 or len(value) == 3
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(
                wav_path).sample_rate

            # Codec simulation
            # (mainly simulating 16 kHz audio downsampled to 8 kHz followed by telephone-channel codecs)
            # (so check whether the original sample rate of the audio is 8 kHz)
            # (other rates can be set via acoustic_simulator_conf['samplerate']; only 16 kHz and 8 kHz are supported)
            # (8 kHz means the original audio is already telephone speech, so no simulation is applied)
            # (any other rate means the audio is not telephone speech and needs simulation)
            if acousticsimulator is not None:
                if acoustic_simulator_samplerate != sample_rate:
                    # print(wav_path)
                    # print('acoustic simulator')
                    simulatored_wav_path = acousticsimulator.cmdFile(wav_path)
                    sample_rate = acoustic_simulator_samplerate
                    wav_path = simulatored_wav_path

                    # print(wav_path)

            if 'resample' in feature_extraction_conf:
                resample_rate = feature_extraction_conf['resample']
            else:
                resample_rate = sample_rate

            if speed_perturb:
                if len(value) == 3:
                    logging.error(
                        "speed perturb does not support segmented wav.scp now")
                assert len(value) == 1
                waveform, sample_rate = _load_wav_with_speed(wav_path, speed)
            else:
                # a value length of 3 means a segmented wav.scp that
                # includes the .wav path, start time, and end time
                if len(value) == 3:
                    start_frame = int(float(value[1]) * sample_rate)
                    end_frame = int(float(value[2]) * sample_rate)
                    waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                        filepath=wav_path,
                        num_frames=end_frame - start_frame,
                        frame_offset=start_frame)
                else:
                    waveform, sample_rate = torchaudio.load(wav_path)

            waveform = waveform * (1 << 15)

            if resample_rate != sample_rate:
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)

            if wav_distortion_rate > 0.0:
                r = random.uniform(0, 1)
                if r < wav_distortion_rate:
                    waveform = waveform.detach().numpy()
                    waveform = _waveform_distortion(waveform,
                                                    distortion_methods_conf)
                    waveform = torch.from_numpy(waveform)

            # Speech augmentation such as added noise and reverberation
            if speech_aug is not None:
                # If several augmentation methods are configured, one is chosen according to
                # random_weight, and prob then decides whether the augmentation is actually applied.
                waveform, _ = speech_aug(waveform, torch.ones(1))
                # torchaudio.save('data/1.wav',waveform,resample_rate)

            mat = kaldi.fbank(
                waveform,
                num_mel_bins=feature_extraction_conf['mel_bins'],
                frame_length=feature_extraction_conf['frame_length'],
                frame_shift=feature_extraction_conf['frame_shift'],
                dither=wav_dither,
                energy_floor=0.0,
                sample_frequency=resample_rate)
            mat = mat.detach().numpy()
            feats.append(mat)
            keys.append(x[0])
            lengths.append(mat.shape[0])
        except Exception as e:
            print(e)
            logging.warning('read utterance {} error'.format(x[0]))
    # Sort it because sorting is required in pack/pad operation
    order = np.argsort(lengths)[::-1]
    sorted_keys = [keys[i] for i in order]
    sorted_feats = [feats[i] for i in order]
    labels = [x[2].split() for x in batch]
    labels = [np.fromiter(map(int, x), dtype=np.int32) for x in labels]
    sorted_labels = [labels[i] for i in order]
    return sorted_keys, sorted_feats, sorted_labels
Example No. 28
def _extract_feature(batch, speed_perturb, wav_distortion_conf,
                     feature_extraction_conf):
    """ Extract acoustic fbank feature from origin waveform.

    Speed perturbation and wave amplitude distortion is optional.

    Args:
        batch: a list of tuple (wav id , wave path).
        speed_perturb: bool, whether or not to use speed pertubation.
        wav_distortion_conf: a dict , the config of wave amplitude distortion.
        feature_extraction_conf:a dict , the config of fbank extraction.

    Returns:
        (keys, feats, labels)
    """
    keys = []
    feats = []
    lengths = []
    wav_dither = wav_distortion_conf['wav_dither']
    wav_distortion_rate = wav_distortion_conf['wav_distortion_rate']
    distortion_methods_conf = wav_distortion_conf['distortion_methods']
    if speed_perturb:
        speeds = [1.0, 1.1, 0.9]
        weights = [1, 1, 1]
        speed = random.choices(speeds, weights, k=1)[0]
        # speed = random.choice(speeds)
    for i, x in enumerate(batch):
        try:
            if speed_perturb:
                waveform, sample_rate = _load_wav_with_speed(x[1], speed)
            else:
                waveform, sample_rate = torchaudio.load_wav(x[1])
            if wav_distortion_rate > 0.0:
                r = random.uniform(0, 1)
                if r < wav_distortion_rate:
                    waveform = waveform.detach().numpy()
                    waveform = _waveform_distortion(waveform,
                                                    distortion_methods_conf)
                    waveform = torch.from_numpy(waveform)
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=feature_extraction_conf['mel_bins'],
                frame_length=feature_extraction_conf['frame_length'],
                frame_shift=feature_extraction_conf['frame_shift'],
                dither=wav_dither,
                energy_floor=0.0,
                sample_frequency=sample_rate
            )
            mat = mat.detach().numpy()
            feats.append(mat)
            keys.append(x[0])
            lengths.append(mat.shape[0])
        except Exception as e:
            print(e)
            logging.warning('read utterance {} error'.format(x[0]))
    # Sort it because sorting is required in pack/pad operation
    order = np.argsort(lengths)[::-1]
    sorted_keys = [keys[i] for i in order]
    sorted_feats = [feats[i] for i in order]
    labels = [x[2].split() for x in batch]
    labels = [np.fromiter(map(int, x), dtype=np.int32) for x in labels]
    sorted_labels = [labels[i] for i in order]
    return sorted_keys, sorted_feats, sorted_labels
Example No. 29
 def __get_sample_fbank():
     from torchaudio import load
     from torchaudio.compliance.kaldi import HAMMING, fbank
     waveform, sample_rate = load('resource/SI1657.wav')
     return fbank(waveform, num_mel_bins=83, window_type=HAMMING)