Example #1
    def _perform_speech_activity_detection(self):
        """
        Checks for type of speech activity detection from config. Choices are NeMo VAD,
        external vad manifest and oracle VAD (generates speech activity labels from provided RTTM files)
        """
        if self.has_vad_model:
            # Auto-splitting is hard-coded on here: long audio is cut into 50 s
            # chunks before VAD to avoid CUDA out-of-memory errors.
            self._dont_auto_split = False
            self._split_duration = 50
            manifest_vad_input = self._diarizer_params.manifest_filepath

            if not self._dont_auto_split:
                logging.info(
                    "Splitting long audio files to avoid CUDA memory issues")
                logging.debug(
                    "Try a smaller split_duration if you still hit CUDA memory issues"
                )
                config = {
                    'manifest_filepath': manifest_vad_input,
                    'time_length': self._vad_window_length_in_sec,
                    'split_duration': self._split_duration,
                    'num_workers': self._cfg.num_workers,
                }
                manifest_vad_input = prepare_manifest(config)
            else:
                logging.warning(
                    "If you encounter CUDA memory issues, try splitting manifest entries by split_duration."
                )

            self._setup_vad_test_data(manifest_vad_input)
            self._run_vad(manifest_vad_input)

        elif self._diarizer_params.vad.external_vad_manifest is not None:
            self._speaker_manifest_path = self._diarizer_params.vad.external_vad_manifest
        elif self._diarizer_params.oracle_vad:
            self._speaker_manifest_path = os.path.join(
                self._speaker_dir, 'oracle_vad_manifest.json')
            self._speaker_manifest_path = write_rttm2manifest(
                self.AUDIO_RTTM_MAP, self._speaker_manifest_path)
        else:
            raise ValueError(
                "Only one of diarizer.oracle_vad, vad.model_path or vad.external_vad_manifest must be passed"
            )
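
For context, the three branches above correspond to mutually exclusive VAD options in the diarizer config. Below is a minimal, hypothetical sketch of such a config; the field names are taken from the snippet itself, but the full schema of the real diarizer config differs across NeMo versions.

from omegaconf import OmegaConf

# Hypothetical minimal config: exactly one of the three VAD options is active.
cfg = OmegaConf.create({
    "diarizer": {
        "manifest_filepath": "input_manifest.json",
        "oracle_vad": False,  # True -> derive speech labels from provided RTTMs
        "vad": {
            "model_path": "MatchboxNet-VAD-3x2",  # a NeMo VAD model -> has_vad_model is True
            "external_vad_manifest": None,  # or a path to precomputed speech segments
        },
    }
})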
Example #2
    def diarize(self,
                paths2audio_files: List[str] = None,
                batch_size: int = 1):
        """
        """

        if paths2audio_files:
            self.paths2audio_files = paths2audio_files
        else:
            if self._cfg.diarizer.paths2audio_files is None:
                raise ValueError(
                    "Pass paths2audio_files either through the config or to the diarize method"
                )
            else:
                self.paths2audio_files = self._cfg.diarizer.paths2audio_files

        if isinstance(self.paths2audio_files, str) and os.path.isfile(
                self.paths2audio_files):
            paths2audio_files = []
            with open(self.paths2audio_files, 'r') as path2file:
                for audiofile in path2file.readlines():
                    audiofile = audiofile.strip()
                    paths2audio_files.append(audiofile)

        elif isinstance(self.paths2audio_files, (list, ListConfig)):
            paths2audio_files = list(self.paths2audio_files)

        else:
            raise ValueError(
                "paths2audio_files must be of type list or path to file containing audio files"
            )

        self.AUDIO_RTTM_MAP = audio_rttm_map(
            paths2audio_files, self._cfg.diarizer.path2groundtruth_rttm_files)

        if self.has_vad_model:
            logging.info("Performing VAD")
            mfst_file = self.path2audio_files_to_manifest(paths2audio_files)
            self._dont_auto_split = False
            self._split_duration = 50
            manifest_vad_input = mfst_file

            if not self._dont_auto_split:
                logging.info(
                    "Splitting long audio files to avoid CUDA memory issues")
                logging.debug(
                    "Try a smaller split_duration if you still hit CUDA memory issues"
                )
                config = {
                    'manifest_filepath': mfst_file,
                    'time_length': self._vad_window_length_in_sec,
                    'split_duration': self._split_duration,
                    'num_workers': self._cfg.num_workers,
                }
                manifest_vad_input = prepare_manifest(config)
            else:
                logging.warning(
                    "If you encounter CUDA memory issues, try splitting manifest entries by split_duration."
                )

            self._setup_vad_test_data(manifest_vad_input)
            self._run_vad(manifest_vad_input)
        else:
            if not os.path.exists(self._speaker_manifest_path):
                raise NotFoundError("Oracle VAD based manifest file not found")

        self._extract_embeddings(self._speaker_manifest_path)
        out_rttm_dir = os.path.join(self._out_dir, 'pred_rttms')
        os.makedirs(out_rttm_dir, exist_ok=True)

        perform_diarization(
            embeddings_file=self._embeddings_file,
            reco2num=self._num_speakers,
            manifest_path=self._speaker_manifest_path,
            sample_rate=self._cfg.sample_rate,
            window=self._cfg.diarizer.speaker_embeddings.window_length_in_sec,
            shift=self._cfg.diarizer.speaker_embeddings.shift_length_in_sec,
            audio_rttm_map=self.AUDIO_RTTM_MAP,
            out_rttm_dir=out_rttm_dir,
            max_num_speakers=self.max_num_speakers,
        )
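
The input handling at the top of diarize() accepts either an explicit list of paths or a text file listing one audio path per line. The same normalization as a standalone, runnable sketch (the helper name is ours, not NeMo's):

import os
from typing import List, Union

def normalize_audio_paths(paths: Union[str, List[str]]) -> List[str]:
    """Accept either a file listing one audio path per line, or a list of paths."""
    if isinstance(paths, str) and os.path.isfile(paths):
        with open(paths, 'r') as path_file:
            return [line.strip() for line in path_file if line.strip()]
    if isinstance(paths, (list, tuple)):
        return list(paths)
    raise ValueError("paths must be a list or a path to a file listing audio files")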
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument("--vad_model",
                        type=str,
                        default="MatchboxNet-VAD-3x2",
                        required=False,
                        help="Pass: '******'")
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help=
        "Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir",
                        type=str,
                        default="vad_frame",
                        help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    parser.add_argument("--normalize_audio", type=bool, default=False)
    parser.add_argument("--num_workers", type=float, default=20)
    parser.add_argument("--split_duration", type=float, default=400)
    parser.add_argument(
        "--dont_auto_split",
        default=False,
        action='store_true',
        help=
        "Disable automatic splitting of manifest entries by split_duration; splitting avoids potential CUDA out-of-memory issues on long audio.",
    )

    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(
            restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(
            model_name=args.vad_model)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    # Prepare manifest for streaming VAD
    manifest_vad_input = args.dataset
    if not args.dont_auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug(
            "Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'manifest_filepath': manifest_vad_input,
            'time_length': args.time_length,
            'split_duration': args.split_duration,
            'num_workers': args.num_workers,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issues, try splitting manifest entries by split_duration."
        )

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': [
                'infer',
            ],
            'num_workers': args.num_workers,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
            'normalize_audio': args.normalize_audio,
        })

    vad_model = vad_model.to(device)
    vad_model.eval()

    # Consecutive streaming windows of length time_length are shifted by
    # shift_length, so neighboring segments overlap; trunc and trunc_l are the
    # frame counts trimmed from segment edges when stitching predictions.
    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    data = []
    with open(args.dataset, 'r') as manifest:
        for line in manifest:
            file = json.loads(line)['audio_filepath'].split("/")[-1]
            data.append(file.split(".wav")[0])
    logging.info(f"Inference on {len(data)} audio files/json lines!")

    status = get_vad_stream_status(data)
    for i, test_batch in enumerate(vad_model.test_dataloader()):
        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0],
                                  input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]

            if status[i] == 'start':
                to_save = pred[:-trunc]
            elif status[i] == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status[i] == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred

            all_len += len(to_save)
            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for f in range(len(to_save)):
                    fout.write('{0:0.4f}\n'.format(to_save[f]))
        del test_batch
        if status[i] == 'end' or status[i] == 'single':
            logging.debug(
                f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0
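
With the CLI defaults above (time_length=0.63, shift_length=0.01), the stitching arithmetic works out as follows. This is a self-contained illustration of the truncation logic, not part of the script:

# Worked numbers for the default time_length and shift_length.
time_length, shift_length = 0.63, 0.01
time_unit = int(time_length / shift_length)  # 63 frames covered by one window
trunc = int(time_unit / 2)                   # 31 frames trimmed from a 'start' tail
trunc_l = time_unit - trunc                  # 32 frames trimmed from an 'end' head

# A 'start' chunk keeps pred[:-31], a 'next' chunk keeps pred[31:-32], and an
# 'end' chunk keeps pred[32:], so the kept frames tile the file without double
# counting frames at the split boundaries.
print(time_unit, trunc, trunc_l)  # 63 31 32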
Example #4
def main(cfg):
    if not cfg.dataset:
        raise ValueError("You must provide the path to a JSON manifest of evaluation data")

    # Each line of the dataset must have a distinct audio_filepath with a unique
    # base name; this simplifies the edge cases handled below.
    key_meta_map = {}
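    # Each manifest line is a JSON object, e.g.
    #   {"audio_filepath": "/data/calls/session1.wav", "offset": 0, "duration": null}
    # for which uniq_audio_name below becomes "session1".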
    with open(cfg.dataset, 'r') as manifest:
        for line in manifest.readlines():
            audio_filepath = json.loads(line.strip())['audio_filepath']
            uniq_audio_name = audio_filepath.split('/')[-1].rsplit('.', 1)[0]
            if uniq_audio_name in key_meta_map:
                raise ValueError("Please make sure each line has a different audio_filepath!")
            key_meta_map[uniq_audio_name] = {'audio_filepath': audio_filepath}

    # Prepare manifest for streaming VAD
    manifest_vad_input = cfg.dataset
    if cfg.prepare_manifest.auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'input': manifest_vad_input,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'split_duration': cfg.prepare_manifest.split_duration,
            'num_workers': cfg.num_workers,
            'prepared_manfiest_vad_input': cfg.prepared_manfiest_vad_input,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issues, try splitting manifest entries by split_duration."
        )

    torch.set_grad_enabled(False)
    vad_model = init_vad_model(cfg.vad.model_path)

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer',],
            'num_workers': cfg.num_workers,
            'shuffle': False,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'shift_length_in_sec': cfg.vad.parameters.shift_length_in_sec,
            'trim_silence': False,
            'normalize_audio': cfg.vad.parameters.normalize_audio,
        }
    )

    vad_model = vad_model.to(device)
    vad_model.eval()

    if not os.path.exists(cfg.frame_out_dir):
        os.mkdir(cfg.frame_out_dir)
    else:
        logging.warning(
            "Note: frame_out_dir already exists. If a new file shares its name with an existing file, predictions will be appended to that file, which can corrupt later steps."
        )

    logging.info("Generating frame level prediction ")
    pred_dir = generate_vad_frame_pred(
        vad_model=vad_model,
        window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
        shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
        manifest_vad_input=manifest_vad_input,
        out_dir=cfg.frame_out_dir,
    )
    logging.info(
        f"Finished generating frame-level VAD predictions with window_length_in_sec={cfg.vad.parameters.window_length_in_sec} and shift_length_in_sec={cfg.vad.parameters.shift_length_in_sec}"
    )

    # overlap smoothing filter
    if cfg.gen_overlap_seq:
        # Generate predictions with overlapping input segments, then apply a
        # smoothing filter to decide the label of a frame spanned by multiple
        # segments. smoothing_method is either majority vote ('median') or
        # average ('mean').
        logging.info("Generating predictions with overlapping input segments")
        smoothing_pred_dir = generate_overlap_vad_seq(
            frame_pred_dir=pred_dir,
            smoothing_method=cfg.vad.parameters.smoothing,
            overlap=cfg.vad.parameters.overlap,
            window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.smoothing_out_dir,
        )
        logging.info(
            f"Finish generating predictions with overlapping input segments with smoothing_method={cfg.vad.parameters.smoothing} and overlap={cfg.vad.parameters.overlap}"
        )
        pred_dir = smoothing_pred_dir

    # postprocessing and generate speech segments
    if cfg.gen_seg_table:
        logging.info("Converting frame level prediction to speech/no-speech segment in start and end times format.")
        table_out_dir = generate_vad_segment_table(
            vad_pred_dir=pred_dir,
            postprocessing_params=cfg.vad.parameters.postprocessing,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.table_out_dir,
        )
        logging.info(
            f"Finish generating speech semgents table with postprocessing_params: {cfg.vad.parameters.postprocessing}"
        )

    if cfg.write_to_manifest:
        # Note: this assumes cfg.gen_seg_table was enabled above, so that
        # table_out_dir is defined.
        for i in key_meta_map:
            key_meta_map[i]['rttm_filepath'] = os.path.join(table_out_dir, i + ".txt")

        if not cfg.out_manifest_filepath:
            out_manifest_filepath = "vad_out.json"
        else:
            out_manifest_filepath = cfg.out_manifest_filepath
        out_manifest_filepath = write_rttm2manifest(key_meta_map, out_manifest_filepath)
        logging.info(f"Writing VAD output to manifest: {out_manifest_filepath}")
Example #5
def main():
    parser = ArgumentParser()
    parser.add_argument("--inp_dir",
                        type=str,
                        required=True,
                        help="(full path) folder of files to be processed")
    parser.add_argument(
        "--inp_list",
        type=str,
        help=
        "(full path) a file containing the names of files inside inp_dir to be processed"
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default=".",
        help="(full path) location to store generated json file")
    parser.add_argument("--manifest_name",
                        type=str,
                        default="generated_manifest",
                        help="name of generated json file")
    parser.add_argument("--split_duration",
                        type=int,
                        required=True,
                        help="max duration of each audio clip/line")
    parser.add_argument(
        "--window_length_in_sec",
        type=float,
        default=0.63,
        help="window length in sec for VAD context input , default is 0.63s",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=4,
                        help="number of workers for multiprocessing")

    args = parser.parse_args()

    if not args.inp_list:
        input_audios = []
        for root, dirs, files in os.walk(args.inp_dir):
            for basename in files:
                if basename.endswith('.wav'):
                    filename = os.path.join(root, basename)
                    input_audios.append(filename)
    else:
        name_list = np.loadtxt(args.inp_list, dtype='str')
        input_audios = [
            os.path.join(args.inp_dir, name + ".wav") for name in name_list
        ]

    input_list = []
    for i in input_audios:
        input_list.append({'audio_filepath': i, "offset": 0, "duration": None})

    logging.info(f"Number of wav files to be processed: {len(input_audios)}")
    output_path = os.path.join(args.out_dir, args.manifest_name + '.json')

    logging.info("Split long audio file to avoid CUDA memory issue")
    logging.debug(
        "Try smaller split_duration if you still have CUDA memory issue")

    config = {
        'input': input_list,
        'window_length_in_sec': args.window_length_in_sec,
        'split_duration': args.split_duration,
        'num_workers': args.num_workers,
        'prepared_manfiest_vad_input': output_path,
    }
    manifest_vad_input = prepare_manifest(config)
    logging.info(f"Done! Save to {manifest_vad_input}")