示例#1
0
def get_vad(vad_dir, file_name, vad_suffix, sig, fea):
    """ Obtain voice activity detection (VAD) output for one recording.

        If no VAD directory is given, VAD is computed from the signal with
        ``compute_vad``; otherwise it is loaded from the file
        ``<vad_dir>/<file_name><vad_suffix>`` with
        ``load_vad_lab_as_bool_vec``. In both cases the helper's result is
        sliced with ``[:len(fea)]`` before being returned.

        :param vad_dir: directory with vad files, None to compute VAD
        :type vad_dir: str
        :param file_name: name of the file
        :type file_name: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
        :param sig: input signal
        :type sig: numpy.array
        :param fea: MFCCs
        :type fea: numpy.array
        :returns: result of the VAD helper truncated to ``len(fea)`` items
                  (described upstream as a list with boolean values -
                  TODO confirm the exact shape returned by the helpers)
        :rtype: list
    """
    if vad_dir is not None:
        # Pre-computed VAD labels are available on disk.
        vad_path = os.path.join(vad_dir, file_name) + vad_suffix
        loginfo('[wav2ivec.get_vad] Loading VAD from file {} ...'.format(
            file_name))
        loaded = load_vad_lab_as_bool_vec(vad_path)
        return loaded[:len(fea)]

    loginfo('[wav2ivec.get_vad] Computing VAD ...')
    # Same window length / overlap expressions as used for MFCC extraction.
    frame_length = WINDOWSIZE / SOURCERATE
    frame_overlap = (WINDOWSIZE - TARGETRATE) / SOURCERATE
    computed = compute_vad(sig,
                           win_length=frame_length,
                           win_overlap=frame_overlap)
    return computed[:len(fea)]
示例#2
0
def process_file(wav_dir,
                 vad_dir,
                 out_dir,
                 file_name,
                 model,
                 wav_suffix='.wav',
                 vad_suffix='.lab.gz'):
    """ Extract i-vector from wav file and store it with ``np.save``.

        The audio is read, resampled to 8000 Hz if necessary, optionally
        dithered, converted to MFCCs, filtered by VAD and passed to
        ``get_ivec``. A non-None result is saved under
        ``<out_dir>/<file_name>``.

        :param wav_dir: directory with wav files
        :type wav_dir: str
        :param vad_dir: directory with vad files
        :type vad_dir: str
        :param out_dir: output directory
        :type out_dir: str
        :param file_name: name of the file
        :type file_name: str
        :param model: input models for i-vector extraction (9 items, in the
                      order returned by ``init``)
        :type model: tuple
        :param wav_suffix: suffix of wav files
        :type wav_suffix: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
        :raises GeneralException: when the input audio is not mono
    """
    target_rate = 8000
    loginfo('[wav2ivec.process_file] Processing file {} ...'.format(file_name))
    # Only part of the model tuple is needed here; weights and covariances
    # are ignored.
    _, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt = model
    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise GeneralException(
            '[wav2ivec.process_file] Expected mono as input audio.')
    if rate != target_rate:
        logwarning(
            '[wav2ivec.process_file] '
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # The second argument of signal.resample is the NUMBER OF OUTPUT
        # SAMPLES, not a sampling rate: keep the duration and scale the
        # sample count by target_rate / rate.
        # NOTE(review): assumes `signal` is scipy.signal - confirm.
        n_target = int(round(len(sig) * float(target_rate) / rate))
        sig = signal.resample(sig, n_target)
    if ADDDITHER > 0.0:
        loginfo('[wav2ivec.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)

    fea = get_mfccs(sig)
    vad, n_regions, n_frames = get_vad(vad_dir, file_name, vad_suffix, sig,
                                       fea)

    # Keep only the frames selected by VAD.
    fea = fea[vad, ...]
    w = get_ivec(fea, numg, dimf, gmm_model, ubm_means, ubm_norm, v, mvvt)
    if w is not None:
        # file_name may contain sub-directories; create them under out_dir.
        Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        np.save(os.path.join(out_dir, file_name), w)
示例#3
0
def get_mfccs(sig):
    """ Extract MFCC features (with derivatives) from signal.

        Builds a mel filter bank, computes HTK-style MFCCs, appends
        derivatives using ``(deltawindow, accwindow)`` and finally reorders
        the feature columns.

        :param sig: input signal
        :type sig: numpy.array
        :returns: MFCCs
        :rtype: numpy.array
    """
    loginfo('[wav2ivec.get_mfccs] Extracting MFCC features ...')
    window = WINDOWSIZE / SOURCERATE
    mel_bank = features.mel_fbank_mx(winlen_nfft=window,
                                     fs=fs,
                                     NUMCHANS=NUMCHANS,
                                     LOFREQ=LOFREQ,
                                     HIFREQ=HIFREQ)
    overlap = (WINDOWSIZE - TARGETRATE) / SOURCERATE
    mfcc_options = dict(window=window,
                        noverlap=overlap,
                        fbank_mx=mel_bank,
                        _0='first',
                        NUMCEPS=NUMCEPS,
                        RAWENERGY=RAWENERGY,
                        PREEMCOEF=PREEMCOEF,
                        CEPLIFTER=CEPLIFTER,
                        ZMEANSOURCE=ZMEANSOURCE,
                        ENORMALISE=ENORMALISE,
                        ESCALE=0.1,
                        SILFLOOR=50.0,
                        USEHAMMING=True)
    fea = features.mfcc_htk(sig, **mfcc_options)

    loginfo('[wav2ivec.get_mfccs] Adding derivatives ...')
    fea = features.add_deriv(fea, (deltawindow, accwindow))

    loginfo('[wav2ivec.get_mfccs] Reshaping to SFeaCat conventions ...')
    # View each row as 3 groups, swap the last two axes and flatten again,
    # which interleaves the three groups column by column.
    n_rows = fea.shape[0]
    grouped = fea.reshape(n_rows, 3, -1)
    interleaved = grouped.transpose((0, 2, 1))
    return interleaved.reshape(n_rows, -1)
示例#4
0
def init(ubm_file, v_file):
    """ Initialize i-vector extractor.

        Loads the UBM and the V matrix and precomputes everything the
        extraction functions expect to receive in the ``model`` tuple.

        :param ubm_file: path to UBM
        :type ubm_file: str
        :param v_file: path to v matrix
        :type v_file: str
        :returns: loaded and initialized models as
                  ``(ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model,
                  numg, dimf, v, mvvt)``
        :rtype: tuple
    """
    loginfo('[wav2ivec.init] Loading UBM file ...')
    ubm_weights, ubm_means, ubm_covs = load_ubm(ubm_file)
    gmm_model = gmm.gmm_eval_prep(ubm_weights, ubm_means, ubm_covs)
    numg, dimf = ubm_means.shape[0], ubm_means.shape[1]

    # A normalizer is only derived when the covariances have one value per
    # feature dimension (presumably the diagonal-covariance case - TODO
    # confirm); otherwise no normalization term is provided.
    ubm_norm = 1 / np.sqrt(ubm_covs) if ubm_covs.shape[1] == dimf else None

    loginfo('[wav2ivec.init] Loading V model file ...')
    v = np.load(v_file)
    mvvt = iv.compute_VtV(v, numg)
    return (ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model, numg, dimf,
            v, mvvt)
示例#5
0
def get_ivec(fea, numg, dimf, gmm_model, ubm_means, ubm_norm, v, mvvt):
    """ Estimate an i-vector from a feature matrix.

        Floating CMVN is applied first, then statistics are accumulated
        with ``gmm.gmm_eval`` over chunks of at most 1000 rows, normalized
        with ``normalize_stats`` and passed to ``iv.estimate_i``.

        :param fea: input features, one row per frame
        :type fea: numpy.array
        :param numg: size of the first accumulator (``n``)
        :type numg: int
        :param dimf: second dimension of the second accumulator (``f``)
        :type dimf: int
        :param gmm_model: model passed to ``gmm.gmm_eval``
        :param ubm_means: UBM means passed to ``normalize_stats``
        :param ubm_norm: UBM normalizer passed to ``normalize_stats``
        :param v: V matrix; its dtype is used for the statistics
        :type v: numpy.array
        :param mvvt: precomputed term passed to ``iv.estimate_i``
        :returns: transposed result of ``iv.estimate_i``
    """
    loginfo('[wav2ivec.get_ivec] Applying floating CMVN ...')
    fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True)
    n_rows, _ = fea.shape

    # Running totals of the first value returned by gmm_eval; they are
    # accumulated but not used for the final result.
    llh_total = 0
    llh_count = 0
    n_acc = np.zeros(numg, dtype=np.float32)
    f_acc = np.zeros((numg, dimf), dtype=np.float32)

    loginfo('[wav2ivec.get_ivec] Computing statistics ...')
    # Process the rows in chunks of 1000 indices.
    for chunk in split_seq(range(n_rows), 1000):
        chunk_llh, chunk_n, chunk_f = gmm.gmm_eval(fea[chunk, :],
                                                   gmm_model,
                                                   return_accums=1)
        llh_total = llh_total + chunk_llh.sum()
        llh_count = llh_count + chunk_llh.shape[0]
        # Non in-place additions on purpose: dtype promotion is kept as is.
        n_acc = n_acc + chunk_n
        f_acc = f_acc + chunk_f

    n_acc, f_acc = normalize_stats(n_acc, f_acc, ubm_means, ubm_norm)
    f_row = row(f_acc.astype(v.dtype))
    n_row = row(n_acc.astype(v.dtype))

    loginfo('[wav2ivec.get_ivec] Computing i-vector ...')
    ivec = iv.estimate_i(n_row, f_row, v, mvvt)
    return ivec.T
示例#6
0
def main(argv):
    """ Command line entry point: run diarization with a PLDA model.

        Parses the command line, sets the number of cores for the MKL
        library, scores the input with ``Diarization`` and then computes DER
        against a reference and/or dumps .rttm files.

        :param argv: kept for interface compatibility; it is not read,
                     arguments are parsed by argparse from ``sys.argv``
        :type argv: list
        :returns: 0 on success, 1 when neither --reference nor --out-dir
                  was specified
        :rtype: int
    """
    # The text is the program description; passing it positionally would set
    # `prog` and garble the usage line.
    parser = argparse.ArgumentParser(
        description='Run diarization on input data with PLDA model.')
    parser.add_argument('-l',
                        '--input-list',
                        help='list of input files without suffix',
                        action='store',
                        dest='input_list',
                        type=str,
                        required=True)
    parser.add_argument(
        '--norm-list',
        help='list of input normalization files without suffix in .npy format',
        action='store',
        dest='norm_list',
        type=str,
        default=None,
        required=False)
    parser.add_argument(
        '--ivecs-dir',
        help=
        'directory containing i-vectos using class IvecSet - pickle format',
        action='store',
        dest='ivecs_dir',
        type=str,
        required=True)
    parser.add_argument('--out-dir',
                        help='output directory for storing .rttm files',
                        action='store',
                        dest='out_dir',
                        type=str,
                        required=False)
    parser.add_argument('--reference',
                        help='reference rttm file for system scoring',
                        action='store',
                        dest='reference',
                        type=str,
                        required=False)
    parser.add_argument('--plda-model-dir',
                        help='directory with PLDA model files',
                        action='store',
                        dest='plda_model_dir',
                        type=str,
                        required=True)
    parser.add_argument('-j',
                        '--num-cores',
                        help='number of processor cores to use',
                        action='store',
                        dest='num_cores',
                        type=int,
                        default=multiprocessing.cpu_count(),
                        required=False)
    args = parser.parse_args()
    if args.reference is None and args.out_dir is None:
        parser.print_help()
        sys.stderr.write(
            'at least one of --reference and --out-dir must be specified' +
            os.linesep)
        # Without a reference or an output directory the scores would be
        # computed and thrown away, so stop here with an error code.
        return 1

    loginfo('[diar.main] Setting {} processor cores for the MKL library ...'.
            format(args.num_cores))
    set_mkl(args.num_cores)
    diar = Diarization(args.input_list, args.norm_list, args.ivecs_dir,
                       args.out_dir, args.plda_model_dir)
    scores = diar.score()
    if args.reference is not None:
        diar.get_der(args.reference, scores)
    if args.out_dir is not None:
        diar.dump_rttm(scores)

    return 0
示例#7
0
def main(argv):
    """ Command line entry point: run diarization with a PLDA model.

        Parses the command line, scores the input with ``Diarization`` and
        then computes DER against a reference and/or dumps .rttm files.

        :param argv: kept for interface compatibility; it is not read,
                     arguments are parsed by argparse from ``sys.argv``
        :type argv: list
        :returns: 0 on success, 1 when neither --reference nor --out-dir
                  was specified
        :rtype: int
    """
    # The text is the program description; passing it positionally would set
    # `prog` and garble the usage line.
    parser = argparse.ArgumentParser(
        description='Run diarization on input data with PLDA model.')
    parser.add_argument('-l',
                        '--input-list',
                        help='list of input files without suffix',
                        action='store',
                        dest='input_list',
                        type=str,
                        required=True)
    parser.add_argument(
        '--norm-list',
        help='list of input normalization files without suffix in .npy format',
        action='store',
        dest='norm_list',
        type=str,
        default=None,
        required=False)
    parser.add_argument(
        '--ivecs-dir',
        help=
        'directory containing i-vectos using class IvecSet - pickle format',
        action='store',
        dest='ivecs_dir',
        type=str,
        required=True)
    parser.add_argument('--out-dir',
                        help='output directory for storing .rttm files',
                        action='store',
                        dest='out_dir',
                        type=str,
                        required=False)
    parser.add_argument('--reference',
                        help='reference rttm file for system scoring',
                        action='store',
                        dest='reference',
                        type=str,
                        required=False)
    parser.add_argument('--plda-model-dir',
                        help='directory with PLDA model files',
                        action='store',
                        dest='plda_model_dir',
                        type=str,
                        required=True)
    parser.add_argument('-j',
                        '--num-cores',
                        help='number of processor cores to use',
                        action='store',
                        dest='num_cores',
                        type=int,
                        default=multiprocessing.cpu_count(),
                        required=False)
    args = parser.parse_args()
    if args.reference is None and args.out_dir is None:
        parser.print_help()
        sys.stderr.write(
            'at least one of --reference and --out-dir must be specified' +
            os.linesep)
        # Without a reference or an output directory the scores would be
        # computed and thrown away, so stop here with an error code.
        return 1

    loginfo('[diar.main] Using {} threads...'.format(args.num_cores))
    # TODO: Currently, num_cores has no effect beyond the log message above.
    #       Consider refactoring the code in lib.diarization so that
    #       clustering is done in parallel in num_cores workers. This is
    #       probably a better idea than using multithreading in the BLAS or
    #       in the actual k-means in sklearn (passing n_jobs to KMeans only
    #       runs each initialization in a separate worker, and because of
    #       the GIL those are not real threads).
    diar = Diarization(args.input_list, args.norm_list, args.ivecs_dir,
                       args.out_dir, args.plda_model_dir)
    scores = diar.score()
    if args.reference is not None:
        diar.get_der(args.reference, scores)
    if args.out_dir is not None:
        diar.dump_rttm(scores)

    return 0
示例#8
0
def process_file(wav_dir,
                 vad_dir,
                 out_dir,
                 file_name,
                 model,
                 min_size,
                 max_size,
                 tolerance,
                 wav_suffix='.wav',
                 vad_suffix='.lab.gz'):
    """ Extract i-vectors from wav file, one per speech segment.

        The audio is loaded at 8000 Hz, optionally dithered and converted
        to MFCCs. For every segment produced by ``get_segments`` an i-vector
        is estimated and collected in an ``IvecSet``, which is saved as
        ``<out_dir>/<file_name>.pkl``.

        :param wav_dir: directory with wav files
        :type wav_dir: str
        :param vad_dir: directory with vad files
        :type vad_dir: str
        :param out_dir: output directory
        :type out_dir: str
        :param file_name: name of the file; when it contains whitespace
                          separated fields only the first one is used
        :type file_name: str
        :param model: input models for i-vector extraction (9 items)
        :type model: tuple
        :param min_size: minimal size of window in ms
        :type min_size: int
        :param max_size: maximal size of window in ms
        :type max_size: int
        :param tolerance: accept given number of frames as speech even when it
                          is marked as silence
        :type tolerance: int
        :param wav_suffix: suffix of wav files
        :type wav_suffix: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
    """
    loginfo('[wav2ivecs.process_file] Processing file {} '
            '...'.format(file_name))
    # Weights and covariances from the model tuple are not needed here.
    (_, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt) = model

    # A list line may carry extra columns; the first field is the file name.
    fields = file_name.split()
    if len(fields) > 1:
        file_name = fields[0]

    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate = get_sr(wav)
    loginfo(wav)
    if rate != 8000:
        logwarning('[wav2ivec.process_file] '
                   'The input file is expected to be in 8000 Hz, got {} Hz '
                   'instead, resampling.'.format(rate))
    # The audio is always loaded through af_to_array with target_sr=8000.
    rate, sig = af_to_array(wav, target_sr=8000)
    if ADDDITHER > 0.0:  # NOTE: ADDDITHER is not defined in this function
        sig = features.add_dither(sig, ADDDITHER)

    fea = get_mfccs(sig)
    vad, n_regions, n_frames = get_vad(vad_dir, file_name, vad_suffix, sig,
                                       fea)

    ivec_set = IvecSet()
    ivec_set.name = file_name
    for seg in get_segments(vad, min_size, max_size, tolerance):
        start = get_num_segments(seg[0])
        end = get_num_segments(seg[1])
        # The segment end index is inclusive here, hence the + 1.
        seg_fea = fea[seg[0]:seg[1] + 1]
        w = get_ivec(seg_fea, numg, dimf, gmm_model, ubm_means, ubm_norm, v,
                     mvvt)
        if w is not None:
            ivec_set.add(w, start, end)

    target_dir = os.path.join(out_dir, os.path.dirname(file_name))
    Tools.mkdir_p(target_dir)
    ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
示例#9
0
def main(argv):
    """ Command line entry point: extract one i-vector per audio file.

        Parses the command line, loads the models with ``init``, sets the
        number of cores for the MKL library and calls ``process_file`` for
        every non-blank line of the input list.

        :param argv: kept for interface compatibility; it is not read,
                     arguments are parsed by argparse from ``sys.argv``
        :type argv: list
        :returns: 0 on success
        :rtype: int
    """
    # The text is the program description; passing it positionally would set
    # `prog` and garble the usage line.
    parser = argparse.ArgumentParser(
        description='Extract i-vector from audio wav files 1:1.')
    parser.add_argument('-l',
                        '--input-list',
                        help='list of input files without suffix',
                        action='store',
                        dest='input_list',
                        type=str,
                        required=True)
    parser.add_argument(
        '--audio-dir',
        help='directory with audio files in .wav format - 8000Hz, 16bit-s, 1c',
        action='store',
        dest='audio_dir',
        type=str,
        required=True)
    # The single-dash spelling is kept for backward compatibility; the
    # double-dash alias matches all other long options.
    parser.add_argument('-wav-suffix',
                        '--wav-suffix',
                        help='wav file suffix',
                        action='store',
                        dest='wav_suffix',
                        type=str,
                        default='.wav',
                        required=False)
    parser.add_argument(
        '--vad-dir',
        help='directory with lab files - Voice/Speech activity detection',
        action='store',
        dest='vad_dir',
        type=str,
        required=False)
    parser.add_argument('-vad-suffix',
                        '--vad-suffix',
                        help='Voice Activity Detector file suffix',
                        action='store',
                        dest='vad_suffix',
                        type=str,
                        default='.lab.gz',
                        required=False)
    parser.add_argument('--out-dir',
                        help='output directory for storing i-vectors',
                        action='store',
                        dest='out_dir',
                        type=str,
                        required=True)
    parser.add_argument('--ubm-file',
                        help='Universal Background Model file',
                        action='store',
                        dest='ubm_file',
                        type=str,
                        required=True)
    parser.add_argument('--v-file',
                        help='V Model file',
                        action='store',
                        dest='v_file',
                        type=str,
                        required=True)
    parser.add_argument('-j',
                        '--num-cores',
                        help='number of processor cores to use',
                        action='store',
                        dest='num_cores',
                        type=int,
                        default=multiprocessing.cpu_count(),
                        required=False)
    args = parser.parse_args()

    models = init(args.ubm_file, args.v_file)
    loginfo(
        '[wav2ivecs.main] Setting {} processor cores for the MKL library ...'.
        format(args.num_cores))
    set_mkl(args.num_cores)

    # Close the list file deterministically and ignore blank lines, which
    # would otherwise be processed as empty file names.
    with open(args.input_list) as list_file:
        files = [line.rstrip('\n') for line in list_file if line.strip()]

    for f in files:
        process_file(args.audio_dir,
                     args.vad_dir,
                     args.out_dir,
                     f,
                     models,
                     wav_suffix=args.wav_suffix,
                     vad_suffix=args.vad_suffix)

    return 0
示例#10
0
def process_file(wav_dir,
                 vad_dir,
                 out_dir,
                 file_name,
                 model,
                 min_size,
                 max_size,
                 tolerance,
                 wav_suffix='.wav',
                 vad_suffix='.lab.gz'):
    """ Extract i-vectors from wav file, one per speech segment.

        The audio is read, resampled to 8000 Hz if necessary, optionally
        dithered and converted to MFCCs. For every segment produced by
        ``get_segments`` that fits into the feature matrix an i-vector is
        estimated and collected in an ``IvecSet``, which is saved as
        ``<out_dir>/<file_name>.pkl``.

        :param wav_dir: directory with wav files
        :type wav_dir: str
        :param vad_dir: directory with vad files
        :type vad_dir: str
        :param out_dir: output directory
        :type out_dir: str
        :param file_name: name of the file; when it contains whitespace
                          separated fields only the first one is used
        :type file_name: str
        :param model: input models for i-vector extraction (9 items)
        :type model: tuple
        :param min_size: minimal size of window in ms
        :type min_size: int
        :param max_size: maximal size of window in ms
        :type max_size: int
        :param tolerance: accept given number of frames as speech even when it is marked as silence
        :type tolerance: int
        :param wav_suffix: suffix of wav files
        :type wav_suffix: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
        :raises GeneralException: when the input audio is not mono
    """
    target_rate = 8000
    fields = file_name.split()
    loginfo('[wav2ivecs.process_file] Processing file {} ...'.format(
        fields[0]))
    # Weights and covariances from the model tuple are not needed here.
    _, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt = model
    # A list line may carry extra columns; the first field is the file name.
    if len(fields) > 1:
        file_name = fields[0]
    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise GeneralException(
            '[wav2ivec.process_file] Expected mono as input audio.')
    if rate != target_rate:
        logwarning(
            '[wav2ivecs.process_file] '
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # The second argument of signal.resample is the NUMBER OF OUTPUT
        # SAMPLES, not a sampling rate: keep the duration and scale the
        # sample count by target_rate / rate.
        # NOTE(review): assumes `signal` is scipy.signal - confirm.
        n_target = int(round(len(sig) * float(target_rate) / rate))
        sig = signal.resample(sig, n_target)
    if ADDDITHER > 0.0:
        loginfo('[wav2ivecs.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)

    fea = get_mfccs(sig)
    vad, n_regions, n_frames = get_vad(vad_dir, file_name, vad_suffix, sig,
                                       fea)

    ivec_set = IvecSet()
    ivec_set.name = file_name
    last_frame = fea.shape[0] - 1
    for seg in get_segments(vad, min_size, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        loginfo(
            '[wav2ivecs.process_file] Processing speech segment from {} ms to {} ms ...'
            .format(start, end))
        # Skip segments that point past the end of the feature matrix.
        if seg[0] > last_frame or seg[1] > last_frame:
            logwarning(
                '[norm.process_file] Unexpected features dimensionality - check VAD input or audio.'
            )
            continue
        w = get_ivec(fea[seg[0]:seg[1]], numg, dimf, gmm_model, ubm_means,
                     ubm_norm, v, mvvt)
        if w is None:
            continue
        ivec_set.add(w, start, end)
    # file_name may contain sub-directories; create them under out_dir.
    Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
    ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))