Python LatticeFasterDecoderOptions 예제들, kaldi.decoder.LatticeFasterDecoderOptions Python 예제들

예제 #1

0

파일 보기

    def __init__(self, *args, **kwargs):
        """Set up the ASpIRE recognizer and the Kaldi feature rspecifiers.

        All positional and keyword arguments are forwarded unchanged to the
        parent class. The recognizer is stored in ``self.asr``; a unique
        ``.scp`` file name is stored in ``self.scp_fn`` (the file itself is
        not created here) and the MFCC / i-vector pipelines reading from it
        are stored in ``self.feats_rspec`` and ``self.ivectors_rspec``.
        """
        super(Tester, self).__init__(*args, **kwargs)
        print('Initializing aspire model...')
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = 13
        decoder_opts.max_active = 7000
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        self.asr = NnetLatticeFasterRecognizer.from_files(
            "/home/chris/git/pykaldi/examples/setups/aspire/exp/tdnn_7b_chain_online/final.mdl",
            "/home/chris/git/pykaldi/examples/setups/aspire/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
            "/home/chris/git/pykaldi/examples/setups/aspire/data/lang/words.txt",
            decoder_opts=decoder_opts,
            decodable_opts=decodable_opts)

        # mkstemp() hands back an already-open OS-level file descriptor.
        # Only the unique name is wanted, so close the descriptor explicitly
        # (otherwise it leaks) before removing the placeholder file.
        fd, fn = tempfile.mkstemp()
        os.close(fd)
        os.remove(fn)
        self.scp_fn = scp_fn = '%s.scp' % fn

        # Define feature pipelines as Kaldi rspecifiers
        self.feats_rspec = (
            f"ark:compute-mfcc-feats --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/mfcc_hires.conf scp:{scp_fn} ark:- |"
        )
        # Adjacent literals are concatenated: MFCC output is piped straight
        # into the online i-vector extractor.
        self.ivectors_rspec = (
            f"ark:compute-mfcc-feats --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/mfcc_hires.conf scp:{scp_fn} ark:- |"
            f"ivector-extract-online2 --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/ivector_extractor.conf "
            f"ark:/home/chris/git/pykaldi/examples/setups/aspire/data/test/spk2utt ark:- ark:- |"
        )

예제 #2

0

파일 보기

    def __init__(self, scp, model, graph, words, conf, iconf, spk2utt, output, printed=False, log=False):
        """
        Initialize the transcriber.

        Args:
            scp: path to the .SCP file with the audio
            model: path to the .MDL recognition model file
            graph: path to the .FST file of the overall recognition graph
            words: path to the .TXT text corpus file
            conf: path to the recognition .CONF configuration file
            iconf: path to the vector extractor .CONF configuration file
            spk2utt: path to the file listing the segments of each speaker
            output: path to the directory with the recognition results
            printed: flag for printing the recognition results
            log: flag for logging
        """
        # Keep every constructor argument on the instance; only ``output``
        # is converted (to a ``Path``).
        self.scp, self.model, self.graph = scp, model, graph
        self.words, self.conf, self.iconf = words, conf, iconf
        self.spk2utt = spk2utt
        self.output = Path(output)
        self.printed, self.log = printed, log

        # Options for the neural-network computation.
        nnet_opts = NnetSimpleComputationOptions()
        nnet_opts.frame_subsampling_factor = 3
        nnet_opts.acoustic_scale = 1.0

        # Options for the lattice decoder.
        search_opts = LatticeFasterDecoderOptions()
        search_opts.max_active = 7000
        search_opts.beam = 13

        self.asr = NnetLatticeFasterRecognizer.from_files(
            self.model,
            self.graph,
            self.words,
            decoder_opts=search_opts,
            decodable_opts=nnet_opts)

예제 #3

0

파일 보기

파일: nnet3_model.py 프로젝트: uhh-lt/kaldi-model-server

def load_model(config_file,
               online_config,
               models_path='models/',
               beam_size=10,
               frames_per_chunk=50):
    """Build an online nnet3 recognizer described by a YAML model file.

    Args:
        config_file: YAML file whose ``decoder`` section names the model
            files and feature configs.
        online_config: path of the online config file; it is generated from
            the YAML settings when it does not exist yet.
        models_path: prefix prepended to every path taken from the YAML.
        beam_size: value assigned to the decoder's ``beam`` option.
        frames_per_chunk: value assigned to the decodable ``frames_per_chunk``
            option.

    Returns:
        The tuple ``(asr, feat_info, decodable_opts)``.
    """
    with open(config_file, 'r') as yaml_stream:
        yaml_root = yaml.safe_load(yaml_stream)

    decoder_cfg = yaml_root['decoder']

    print(decoder_cfg)

    feat_opts = OnlineNnetFeaturePipelineConfig()
    endpoint_opts = OnlineEndpointConfig()

    if os.path.isfile(online_config):
        print("Loading online conf from:", online_config)
    else:
        # No config on disk yet: derive one from the YAML decoder section.
        print(online_config +
              ' does not exists. Trying to create it from yaml file settings.')
        print(
            'See also online_config_options.info.txt for what possible settings are.'
        )
        with open(online_config, 'w') as conf_out:
            conf_out.write("--add_pitch=False\n")
            conf_out.write("--mfcc_config=" + models_path +
                           decoder_cfg['mfcc-config'] + "\n")
            conf_out.write("--feature_type=mfcc\n")
            conf_out.write("--ivector_extraction_config=" + models_path +
                           decoder_cfg['ivector-extraction-config'] + '\n')
            conf_out.write("--endpoint.silence-phones=" +
                           decoder_cfg['endpoint-silence-phones'] + '\n')

    # Register both option groups with one parser, then fill them from the
    # online config file.
    parser = ParseOptions("")
    feat_opts.register(parser)
    endpoint_opts.register(parser)
    parser.read_config_file(online_config)
    feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

    # Decodable (neural network) options.
    decodable_opts = NnetSimpleLoopedComputationOptions()
    decodable_opts.frames_per_chunk = frames_per_chunk
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.acoustic_scale = 1.0

    # Decoder (search) options.
    search_opts = LatticeFasterDecoderOptions()
    search_opts.max_active = 7000
    search_opts.beam = beam_size

    asr = NnetLatticeFasterOnlineRecognizer.from_files(
        models_path + decoder_cfg["model"],
        models_path + decoder_cfg["fst"],
        models_path + decoder_cfg["word-syms"],
        decoder_opts=search_opts,
        decodable_opts=decodable_opts,
        endpoint_opts=endpoint_opts)

    return asr, feat_info, decodable_opts

예제 #4

0

파일 보기

파일: recognizer.py 프로젝트: yaraCS/Arabic-Speech-Recognition

    def __initialize_decoder(self):
        """Create the GMM lattice recognizer from the files in ``self.MODEL_DIR``.

        Returns:
            The object built by ``GmmLatticeFasterRecognizer.from_files``.
        """
        # Decoding options (same as archive/config/decode.conf);
        # max_active is deliberately not set here.
        search_opts = LatticeFasterDecoderOptions()
        search_opts.lattice_beam = 6.0
        search_opts.beam = 13.0

        # HCLG.fst and words.txt both live in the "graph" subdirectory.
        graph_dir = os.path.join(self.MODEL_DIR, "graph")
        return GmmLatticeFasterRecognizer.from_files(
            os.path.join(self.MODEL_DIR, "final.mdl"),
            os.path.join(graph_dir, "HCLG.fst"),
            os.path.join(graph_dir, "words.txt"),
            decoder_opts=search_opts)

예제 #5

0

파일 보기

    def LoadModels(self):
        """Load the online config, acoustic model and graph, then build the recognizer.

        Reads ``online.conf`` from ``self.CONFIG_FILES_PATH``, ``final.mdl``
        from ``self.AM_PATH`` and ``HCLG.fst``, ``words.txt`` and
        ``word_boundary.int`` from ``self.LM_PATH``. On success the results
        are stored on the instance (``feat_info``, ``samp_freq``,
        ``frame_shift``, ``acwt``, ``transition_model``, ``acoustic_model``,
        ``decoder_graph``, ``symbols``, ``info``, ``asr``).

        Raises:
            ValueError: if any step fails. The original exception is logged
                through ``self.log`` and attached as ``__cause__``.
        """
        try:
            # Define online feature pipeline
            po = ParseOptions("")

            decoder_opts = LatticeFasterDecoderOptions()
            self.endpoint_opts = OnlineEndpointConfig()
            self.decodable_opts = NnetSimpleLoopedComputationOptions()
            feat_opts = OnlineNnetFeaturePipelineConfig()

            # All four option groups are filled from the same config file.
            decoder_opts.register(po)
            self.endpoint_opts.register(po)
            self.decodable_opts.register(po)
            feat_opts.register(po)

            po.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
            self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(
                feat_opts)

            # Set metadata parameters
            self.samp_freq = self.feat_info.mfcc_opts.frame_opts.samp_freq
            # frame_shift_ms is converted from milliseconds to seconds.
            self.frame_shift = self.feat_info.mfcc_opts.frame_opts.frame_shift_ms / 1000
            self.acwt = self.decodable_opts.acoustic_scale

            # Load Acoustic and graph models and other files
            self.transition_model, self.acoustic_model = NnetRecognizer.read_model(
                self.AM_PATH + "/final.mdl")
            graph = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
            self.decoder_graph = LatticeFasterOnlineDecoder(
                graph, decoder_opts)
            self.symbols = _fst.SymbolTable.read_text(self.LM_PATH +
                                                      "/words.txt")
            self.info = WordBoundaryInfo.from_file(
                WordBoundaryInfoNewOpts(), self.LM_PATH + "/word_boundary.int")

            self.asr = NnetLatticeFasterOnlineRecognizer(
                self.transition_model,
                self.acoustic_model,
                self.decoder_graph,
                self.symbols,
                decodable_opts=self.decodable_opts,
                endpoint_opts=self.endpoint_opts)
        except Exception as e:
            self.log.error(e)
            # Chain explicitly so callers can reach the root cause through
            # __cause__ instead of only seeing the generic message.
            raise ValueError(
                "AM and LM loading failed!!! (see logs for more details)"
            ) from e

예제 #6

0

파일 보기

파일: decode.py 프로젝트: hlthu/pykaldi

#!/usr/bin/env python

from __future__ import print_function

from kaldi.asr import NnetLatticeFasterRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Options for the neural-network computation.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.frames_per_chunk = 150
decodable_opts.frame_subsampling_factor = 3
decodable_opts.acoustic_scale = 1.0

# Options for the lattice decoder.
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.max_active = 7000
decoder_opts.beam = 13

# Build the recognizer from the model, decoding graph and word table.
asr = NnetLatticeFasterRecognizer.from_files(
    "exp/tdnn_7b_chain_online/final.mdl",
    "exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
    "exp/tdnn_7b_chain_online/graph_pp/words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts,
)

# Feature pipelines expressed as Kaldi rspecifiers: the first yields the
# MFCC features, the second pipes the same MFCCs into the i-vector extractor.
feats_rspec = "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)

예제 #7

0

파일 보기

    def main(self):
        """Record from the microphone and run recognition on each detected phrase.

        A phrase starts when any recent chunk in the sliding window is louder
        than ``THRESHOLD`` and ends once the whole window is quiet again. The
        captured audio (with a short pre-roll) is saved through
        ``self.save_speech``, recognized with ``self.recognize_speech`` and
        the temporary file is removed.

        NOTE(review): ``num_phrases``, ``FORMAT``, ``CHANNELS``, ``RATE``,
        ``CHUNK``, ``THRESHOLD``, ``SILENCE_LIMIT`` and ``PREV_AUDIO`` are not
        defined in this method; they are assumed to be module-level names.
        When ``num_phrases`` is not -1 the results are collected in a local
        list that is never returned - confirm whether it should be.
        """
        # Construct recognizer
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = 13
        decoder_opts.max_active = 7000
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        asr = NnetLatticeFasterRecognizer.from_files(
            self.dir_path + "/exp/tdnn_7b_chain_online/final.mdl",
            self.dir_path + "/new/graph/HCLG.fst",
            self.dir_path + "/new/graph/words.txt",
            decoder_opts=decoder_opts,
            decodable_opts=decodable_opts)

        # Number of audio chunks per second; used to size both windows.
        rel = RATE / CHUNK
        silence_len = int(SILENCE_LIMIT * rel) + 1
        # The pre-roll length is derived from PREV_AUDIO in one place so the
        # initial buffer and the buffer rebuilt after each phrase agree.
        prev_audio_len = int(PREV_AUDIO * rel) + 1

        p = pyaudio.PyAudio()
        stream = None
        try:
            stream = p.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)

            print("* recording")

            audio2send = []
            slid_win = deque(maxlen=silence_len)
            # Audio captured just before noise was detected is prepended.
            prev_audio = deque(maxlen=prev_audio_len)
            started = False
            n = num_phrases
            response = []

            while num_phrases == -1 or n > 0:
                cur_data = stream.read(CHUNK)
                # Loudness measure of the chunk (samples read as 4-byte wide).
                slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
                if any(x > THRESHOLD for x in slid_win):
                    started = True
                    audio2send.append(cur_data)
                elif started:
                    # The silence limit was reached: finish capture and deliver.
                    filename = self.save_speech(
                        list(prev_audio) + audio2send, p, self.SAVE_PATH,
                        self.WAVE_OUTPUT_FILENAME)
                    # Run the recognizer and collect its result.
                    r = self.recognize_speech(asr)
                    if num_phrases == -1:
                        print("Detected speech: ", r)
                    else:
                        response.append(r)
                    # Remove temp file. Comment line to review.
                    os.remove(filename)
                    # Reset all
                    started = False
                    slid_win = deque(maxlen=silence_len)
                    prev_audio = deque(maxlen=prev_audio_len)
                    audio2send = []
                    n -= 1
                else:
                    prev_audio.append(cur_data)
        finally:
            # Release the audio device even if recording or recognition fails.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

예제 #8

0

파일 보기

def main():
    """Run lattice-based sequence training of an LSTM acoustic model.

    Steps, in order: parse the command line, load the YAML experiment and
    data configs, initialize Horovod, build the dataset and data loader,
    create the model and SGD optimizer, load the seed model, validate the
    decoding-graph files, read the silence phone ids and the transition
    model, build the lattice decoder, read the prior, and finally train for
    ``-num_epochs`` epochs. Rank 0 saves a checkpoint after every epoch.

    Any missing input file is reported on stderr and the process exits with
    status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion",
                        type=str,
                        choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training crtierion")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument(
        "-prior_path",
        help="the prior for decoder, usually named as final.occs in kaldi setup"
    )
    parser.add_argument(
        "-den_dir",
        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio",
                        default=0.1,
                        type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    # The help text previously advertised a default of 60 although the
    # actual default is 100.
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default: 100)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    # Command-line values override / extend the YAML config.
    config['data_path'] = args.data_path
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = list(data['clean_source'].values())

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only pass -transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        prefix = "module."  # added to parameter names by DataParallel
        for k, v in state_dict.items():
            # Strip the prefix only when it is really there, so checkpoints
            # saved without DataParallel keep their parameter names intact.
            name = k[len(prefix):] if k.startswith(prefix) else k
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        # The message previously referenced an undefined name and raised
        # NameError instead of reporting the missing file.
        sys.stderr.write('ERROR: The model file %s does not exist!\n' %
                         (args.seed_model))
        sys.exit(1)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' %
                         (silence_phones))
        sys.exit(1)
    # The first line of the file holds colon-separated integer phone ids.
    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    # To produce raw state-level lattice instead of compact lattice
    decoder_opts.determinize_lattice = False
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    # Normalize the first row of the prior matrix and take its log.
    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()
    for epoch in range(args.num_epochs):

        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model (only on rank 0 so workers do not overwrite each other)
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)

예제 #9

0

파일 보기

파일: nnet_decoder.py 프로젝트: Ralakus/kaldi-demo

    def init(self, nnet_directory, transcription_directory):
        """Validate the directories, then build the rspecifiers and recognizer.

        Args:
            nnet_directory: directory with the model files, or ``None`` to use
                ``KaldiNnetDecoder.CV_default_nnet_directory``.
            transcription_directory: directory with ``wav.scp`` / ``spk2utt``,
                or ``None`` to use
                ``KaldiNnetDecoder.CV_default_transcription_directory``.

        Returns:
            A dict keyed by ``RDK.success``, ``RDK.return_msg`` and
            ``RDK.debug_data``. The status is ``RC.input_validation`` when an
            argument is given but is not exactly of type ``str``; otherwise it
            is ``RC.success`` and the instance attributes ``IV_feats``,
            ``IV_ivectors``, ``IV_asr`` and ``IV_is_ready`` have been set.
        """
        return_msg = "KaldiDecoder:init"
        debug_data = []

        ## input validation
        if nnet_directory is None:
            nnet_directory = KaldiNnetDecoder.CV_default_nnet_directory
        elif type(nnet_directory) is not str:
            return_msg += "nnet_directory is not of type string, is type {}".format(
                type(nnet_directory))
            return {
                RDK.success: RC.input_validation,
                RDK.return_msg: return_msg,
                RDK.debug_data: debug_data
            }

        if transcription_directory is None:
            transcription_directory = KaldiNnetDecoder.CV_default_transcription_directory
        elif type(transcription_directory) is not str:
            return_msg += "transcription_directory is not of type string, is type {}".format(
                type(transcription_directory))
            return {
                RDK.success: RC.input_validation,
                RDK.return_msg: return_msg,
                RDK.debug_data: debug_data
            }
        ##</end> input validation

        ## feats and ivector rspec creation
        # {0} is the nnet directory, {1} the transcription directory.
        feats_rspec = (
            "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
        ).format(nnet_directory, transcription_directory)

        ivectors_rspec = (
            "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
            "ivector-extract-online2 --config={0}/conf/ivector_extractor.conf ark:{1}/spk2utt ark:- ark:- |"
        ).format(nnet_directory, transcription_directory)
        ##</end> feats and ivector rspec creation

        ## asr creation
        search_opts = LatticeFasterDecoderOptions()
        search_opts.beam = 13
        search_opts.max_active = 7000

        nnet_opts = NnetSimpleComputationOptions()
        nnet_opts.acoustic_scale = 1.0
        nnet_opts.frame_subsampling_factor = 3
        nnet_opts.frames_per_chunk = 150

        recognizer = NnetLatticeFasterRecognizer.from_files(
            "{}/final.mdl".format(nnet_directory),
            "{}/graph/HCLG.fst".format(nnet_directory),
            "{}/graph/words.txt".format(nnet_directory),
            decoder_opts=search_opts,
            decodable_opts=nnet_opts)
        ##</end> asr creation

        # Publish the results only after everything was built successfully.
        self.IV_feats = feats_rspec
        self.IV_ivectors = ivectors_rspec
        self.IV_asr = recognizer
        self.IV_is_ready = True

        return {
            RDK.success: RC.success,
            RDK.return_msg: return_msg,
            RDK.debug_data: debug_data
        }

예제 #10

0

파일 보기

파일: subtitle2go.py 프로젝트: entn-at/subtitle2go

def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    """Transcribe a media file and return its words with timing information.

    The input is converted to a 16 kHz mono wav with ffmpeg, decoded with the
    nnet3 model described in ``models/kaldi_tuda_de_nnet3_chain2.yaml`` and
    the temporary files under ``tmp/`` are removed afterwards.

    Args:
        filenameS_hash: identifier used as utterance/speaker key and as the
            stem of the temporary file names.
        filenameS: path of the input media file handed to ffmpeg.
        asr_beamsize: value assigned to the decoder's ``beam`` option.
        asr_max_active: value assigned to the decoder's ``max_active`` option.

    Returns:
        ``(vtt, words)`` where ``words`` are the symbols of the word
        alignment and ``vtt`` is a list of ``[word, timing[1], timing[2]]``
        entries (begin and end in frames according to the original comment).
        Only the alignment of the last decoded utterance is used.

    Raises:
        AssertionError: if nothing was decoded or the feature and i-vector
            keys do not match.
    """
    models_dir = "models/"

    # Read yaml File
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    # write scp file
    with open(scp_filename, 'w') as scp_file:
        scp_file.write("%s %s\n" % (filenameS_hash, wav_filename))

    # write spk2utt file (speaker id and utterance id are the same hash)
    with open(spk2utt_filename, 'w') as spk2utt_file:
        spk2utt_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

    # use ffmpeg to convert the input media file (any format!) to 16 kHz wav mono
    # (the input is the ``filenameS`` parameter; the previous code referenced
    # an undefined name here)
    (
        ffmpeg
            .input(filenameS)
            .output(wav_filename, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
    )

    # Construct recognizer (named ``recognizer`` so it does not shadow this
    # function's own name)
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = asr_beamsize
    decoder_opts.max_active = asr_max_active
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    recognizer = NnetLatticeFasterRecognizer.from_files(
        models_dir + decoder_yaml_opts["model"],
        models_dir + decoder_yaml_opts["fst"],
        models_dir + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)

    # Construct symbol table
    symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])
    phi_label = symbols.find_index("#0")

    # Define feature pipelines as Kaldi rspecifiers. Plain concatenation is
    # used so a '%' inside a file name cannot break string formatting.
    mfcc_conf = models_dir + decoder_yaml_opts["mfcc-config"]
    ivector_conf = models_dir + decoder_yaml_opts["ivector-extraction-config"]
    feats_rspec = ("ark:compute-mfcc-feats --config=" + mfcc_conf +
                   " scp:" + scp_filename + " ark:- |")
    ivectors_rspec = ("ark:compute-mfcc-feats --config=" + mfcc_conf +
                      " scp:" + scp_filename + " ark:-"
                      + " | ivector-extract-online2 --config=" + ivector_conf +
                      " ark:" + spk2utt_filename + " ark:- ark:- |")

    did_decode = False
    # Decode wav files
    with SequentialMatrixReader(feats_rspec) as f, \
            SequentialMatrixReader(ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            did_decode = True
            assert (fkey == ikey)
            out = recognizer.decode((feats, ivectors))
            best_path = functions.compact_lattice_shortest_path(out["lattice"])
            words, _, _ = get_linear_symbol_sequence(shortestpath(best_path))
            timing = functions.compact_lattice_to_word_alignment(best_path)

    assert(did_decode)

    # Maps words to the numbers
    words = indices_to_symbols(symbols, timing[0])

    # Creates the datastructure (Word, begin(Frames), end(Frames))
    vtt = list(map(list, zip(words, timing[1], timing[2])))

    # Cleanup tmp files
    print('removing tmp file:', scp_filename)
    os.remove(scp_filename)
    print('removing tmp file:', wav_filename)
    os.remove(wav_filename)
    print('removing tmp file:', spk2utt_filename)
    os.remove(spk2utt_filename)
    return vtt, words

예제 #11

0

파일 보기

    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Factory for the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Return a callable that builds a scaled GMM decodable for given features.

    The returned function keeps ``trans_model`` and ``acoustic_model`` in its
    closure and takes ``(features, acoustic_scale)``.
    """
    def decodable_wrapper(features, acoustic_scale):
        """Wrap ``features`` in a ``DecodableAmDiagGmmScaled`` object."""
        decodable = DecodableAmDiagGmmScaled(
            acoustic_model, trans_model, features, acoustic_scale)
        return decodable

    return decodable_wrapper


# Bind the loaded models into the (features, acoustic_scale) -> decodable factory.
decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)

# Decoder: options first, then the decoding graph it searches.
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam, decoder_opts.lattice_beam = 13.0, 6.0
decoding_graph = read_fst_kaldi("models/mono/graph/HCLG.fst")
decoder = LatticeFasterDecoder(decoding_graph, decoder_opts)

# Recognizer: decoder + decodable factory + word symbol table.
symbols = SymbolTable.read_text("models/mono/graph/words.txt")
asr = Recognizer(decoder, decodable_wrapper, symbols)

# Decode wave files
# for key, wav in SequentialWaveReader("scp:wav.scp"):
# feats = feat_pipeline(wav)
# out = asr.decode(feats)
# print(key, out["text"], flush=True)

예제 #12

0

파일 보기

파일: latgen.py 프로젝트: zhangfeifan6428/pykaldi2

def main():
    """Decode an evaluation set with an LSTM acoustic model and write lattices.

    Steps, in order: parse the command line, load the YAML config, build the
    evaluation dataset and data loader, create the model and load its
    checkpoint, validate the decoding-graph files, read the transition model
    and the prior, build the lattice decoder, then decode every utterance and
    write its compact lattice to the archive named by ``-out_file``.

    A missing graph or transition-model file is reported on stderr and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    # The help text previously advertised a default of 60 although the
    # actual default is 200.
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default: 200)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    # A single "Eval" source pointing at the data given on the command line.
    config["source_paths"] = [{"type": "Eval", "wav": args.data_path}]

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only pass -transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use the GPU when one is available and fall back to the CPU otherwise;
    # the selected device is used consistently for the model, the checkpoint
    # and the input batches.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    assert os.path.isfile(
        args.model_path), "ERROR: model file {} does not exit!".format(
            args.model_path)

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    prefix = "module."  # added to parameter names by DataParallel
    for k, v in state_dict.items():
        # Decide per key whether the prefix has to be removed; this also
        # handles an empty state dict without relying on a loop variable.
        name = k[len(prefix):] if k.startswith(prefix) else k
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    # Normalize the first row of the prior matrix and take its log.
    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # To produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad():
        with kaldi_util.table.CompactLatticeWriter("ark:" +
                                                   args.out_file) as lat_out:
            for data in test_dataloader:
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]

                x = feat.to(th.float32).to(device)

                prediction = model(x)

                for j in range(len(num_frs)):
                    loglikes = prediction[j, :, :].data.cpu()

                    # Keep only the first num_frs[j] frames of utterance j,
                    # then subtract the log prior from the network outputs.
                    loglikes_j = loglikes[:num_frs[j], :]
                    loglikes_j = loglikes_j - log_prior

                    decoder_out = asr_decoder.decode(
                        kaldi_matrix.Matrix(loglikes_j.numpy()))

                    key = utt_ids[j][0]
                    print(key, decoder_out["text"])

                    print("Log-like per-frame for utterance {} is {}".format(
                        key, decoder_out["likelihood"] / num_frs[j]))

                    # save lattice
                    lat_out[key] = decoder_out["lattice"]