예제 #1
0
    def recognize(self, wav=None):
        """
        Run speech recognition and append the results to a transcription file.

        MFCC features and online i-vectors are read in lockstep from Kaldi
        pipeline rspecifiers assembled from ``self.conf``, ``self.scp``,
        ``self.iconf`` and ``self.spk2utt``. Every (features, i-vectors) pair
        is decoded with ``self.asr``; the lattice is written through a
        gzip-compressed ``lat.gz`` wspecifier and the lower-cased text is
        appended to the transcription file as a tab-separated key/text line.

        Arguments:
            wav: audio file name; when truthy the transcription path is
                ``self.output / wav``, otherwise the relative path
                'transcriptions' is used

        Returns:
            transcriptions: path to the transcription file
        """
        if wav:
            transcriptions = str(self.output / wav)
        else:
            transcriptions = 'transcriptions'

        # The i-vector pipeline extends the MFCC pipeline with the extractor.
        mfcc_pipeline = ("ark:compute-mfcc-feats --config=" + self.conf
                         + " scp:" + self.scp + " ark:- |")
        ivector_pipeline = (mfcc_pipeline + "ivector-extract-online2 "
                            "--config=" + self.iconf + " "
                            "ark:" + self.spk2utt + " ark:- ark:- |")
        lattice_sink = "ark:| gzip -c > lat.gz"

        with SequentialMatrixReader(mfcc_pipeline) as feats_reader:
            with SequentialMatrixReader(ivector_pipeline) as ivectors_reader:
                with CompactLatticeWriter(lattice_sink) as lattice_writer:
                    paired = zip(feats_reader, ivectors_reader)
                    for (utt_id, feats), (ivec_id, ivectors) in paired:
                        # Both tables must be in the same utterance order.
                        assert utt_id == ivec_id
                        result = self.asr.decode((feats, ivectors))
                        lattice_writer[utt_id] = result['lattice']
                        if self.printed:
                            print(utt_id, result['text'], flush=True)
                        # Append one line per utterance.
                        with open(transcriptions, 'a') as sink:
                            sink.write(utt_id + '\t' + result['text'].lower() + '\n')
        return transcriptions
예제 #2
0
 def audio_to_text(self, fn):
     """Transcribe one audio file and return the cleaned-up text.

     The absolute path of ``fn`` is written to ``self.scp_fn`` under the
     utterance id ``utt1``. Features and i-vectors are then read from
     ``self.feats_rspec`` and ``self.ivectors_rspec`` and the first pair is
     decoded with ``self.asr``. Bracketed tags such as "[noise]" are removed
     and runs of spaces are collapsed before the text is returned.

     Returns:
         The cleaned text of the first decoded utterance, or ``None``
         (implicitly) when the readers yield nothing.
     """
     fn = os.path.abspath(fn)
     print('Processing %s.' % fn)
     with open(self.scp_fn, 'w') as scp_out:
         scp_out.write('utt1 %s' % fn)

     with SequentialMatrixReader(self.feats_rspec) as feats_reader:
         with SequentialMatrixReader(self.ivectors_rspec) as ivec_reader:
             for (_key, feats), (_, ivectors) in zip(feats_reader, ivec_reader):
                 started = time()
                 result = self.asr.decode((feats, ivectors))
                 elapsed = time() - started
                 print('out:', result)
                 certainty = likelyhood_to_certainty(result['likelihood'])
                 print('certainty:', certainty)
                 logging.info("%s decoding took %8.2fs, certainty: %f", fn,
                              elapsed, certainty)
                 raw_text = result["text"]
                 print('text:', raw_text)
                 # Drop bracketed tags ("[noise]"), then collapse spaces.
                 untagged = re.sub(r'\[[^\]]+\]', '', raw_text)
                 cleaned = re.sub(r'[ ]+', ' ', untagged)
                 print('text2:', cleaned)
                 # Only the first utterance is decoded; return immediately.
                 return cleaned
예제 #3
0
    def decode(self):
        """Decode every utterance from the prepared feature pipelines.

        Reads ``self.IV_feats`` and ``self.IV_ivectors`` in lockstep, decodes
        each (features, i-vectors) pair with ``self.IV_asr`` and collects the
        recognised text.

        Returns:
            dict with the keys ``RDK.success``, ``RDK.return_msg``,
            ``RDK.debug_data`` and ``"transcriptions"`` (list of decoded
            strings; empty when the decoder is not ready).

        NOTE(review): the not-initialised path also reports ``RC.success``;
        confirm whether a failure code was intended.
        """
        return_msg = "KaldiDecoder:decode"
        debug_data = []
        transcriptions = []

        def build_result(message):
            # Single place that assembles the response dictionary.
            return {
                RDK.success: RC.success,
                RDK.return_msg: message,
                RDK.debug_data: debug_data,
                "transcriptions": transcriptions
            }

        # Guard clause: nothing can be decoded before initialisation.
        if self.IV_is_ready is False:
            return_msg += "KaldiDecoder has not been initialized"
            return build_result(return_msg)

        with SequentialMatrixReader(self.IV_feats) as feats_table:
            with SequentialMatrixReader(self.IV_ivectors) as ivectors_table:
                for (_, feats), (_, ivectors) in zip(feats_table, ivectors_table):
                    decoded = self.IV_asr.decode((feats, ivectors))
                    transcriptions.append(decoded["text"])

        return build_result(return_msg)
예제 #4
0
    def recognize_speech(self, asr):
        """Decode the test set and return the text of the first utterance.

        Feature and i-vector pipelines are built from files below
        ``self.dir_path``. The decoded text is also printed into
        ``<dir_path>/out/test/decode.out`` (opened in "a+" mode).

        Args:
            asr: recognizer object exposing ``decode((feats, ivectors))``.

        Returns:
            The ``"text"`` entry of the first decoding result, or ``None``
            (implicitly) when the readers yield nothing.
        """
        root = self.dir_path

        # The i-vector pipeline is the MFCC pipeline piped into the extractor.
        feats_rspec = (
            "ark:compute-mfcc-feats --config=" + root
            + "/new/conf/mfcc_hires.conf scp:" + root
            + "/data/test/wav.scp ark:- |"
        )
        ivectors_rspec = (
            feats_rspec
            + "ivector-extract-online2 --config=" + root
            + "/new/conf/ivector_extractor.conf ark:" + root
            + "/data/test/spk2utt ark:- ark:- |"
        )

        with SequentialMatrixReader(feats_rspec) as feats_reader:
            with SequentialMatrixReader(ivectors_rspec) as ivectors_reader:
                with open(root + "/out/test/decode.out", "a+") as log_file:
                    pairs = zip(feats_reader, ivectors_reader)
                    for (_key, feats), (_, ivectors) in pairs:
                        result = asr.decode((feats, ivectors))
                        print(result["text"], file=log_file)
                        # Stop after the first utterance.
                        return result["text"]
    def evaluate(self, data_path, remove_scp=True):
        """
        Decode a wav file or a directory of wav files and report accuracy.

        The expected word of each utterance is derived from the numeric
        suffix of its key (text after the last "."), used as a 1-based index
        into the fixed word list below.

        Parameters:
            - data_path (string): full path of the data directory containing
              wav files to be decoded, or just a wav-file.
            - remove_scp (bool): remove the transcription file wav.scp that
              is created for decoding. When False the file is kept.
        Returns:
            - accuracy (float): fraction of utterances whose decoded text
              equals the expected word; 0.0 when wav.scp lists no files.
        Side effects:
            - Writes "<MODEL_NAME>_decoding.csv" in the working directory.
              The header is always written; per-utterance rows are written
              only when wav.scp lists more than one file.
            - When wav.scp lists exactly one file, prints the true word,
              the predicted word and the likelihood instead.
        """
        # create transcription file
        self.__create_transcription(data_path)
        # create pipeline
        pipeline = self.__make_feat_pipeline()
        # words of the data
        WORDS = [
            "صفر", "واحد", "إثنان", "ثلاثة", "أربعة", "خمسة", "ستة", "سبعة",
            "ثمانية", "تسعة", "التنشيط", "التحويل", "الرصيد", "التسديد", "نعم",
            "لا", "التمويل", "البيانات", "الحساب", "إنهاء"
        ]
        # count the utterances listed for decoding
        with open("wav.scp", "r") as fin:
            num_wavs = sum(1 for _ in fin)

        correct = 0.
        # (true_word, decoder output) of the most recent utterance, if any
        last_result = None
        with open("{}_decoding.csv".format(self.MODEL_NAME), 'w') as fout:
            # write csv header
            fout.write("{},{},{},{}\n".format("Filename", "TrueWord",
                                              "Predicted", "Likelihood"))
            # iterate over wav features; the context manager closes the
            # reader even if decoding fails
            with SequentialMatrixReader(pipeline) as reader:
                for key, feats in reader:
                    true_word_id = int(key.split(".")[-1]) - 1
                    true_word = WORDS[true_word_id]
                    out = self.ASR.decode(feats)
                    last_result = (true_word, out)
                    if num_wavs > 1:
                        fout.write("{},{},{},{}\n".format(key, true_word,
                                                          out["text"],
                                                          out["likelihood"]))
                    # was it correct??
                    if true_word == out["text"]:
                        correct += 1.

        # Single-file mode: report on the console, but only if something
        # was actually decoded.
        if num_wavs == 1 and last_result is not None:
            true_word, out = last_result
            print("TrueWord:", true_word)
            print("PredictedWord:", out["text"])
            print("Likelihood:", out["likelihood"])

        # remove wav.scp only when requested
        if remove_scp:
            os.remove("wav.scp")

        # An empty wav.scp would otherwise divide by zero.
        if num_wavs == 0:
            return 0.0
        return correct / num_wavs
예제 #6
0
def build_response(audio_file):
    """
    Run speech activity detection (SAD) on a single wav file.

        1. Computes MFCC features required for the SAD module
        2. Performs SAD on the first utterance of the feature table
        3. Post-processes the alignment and merges consecutive segments
        4. Sums the duration of the resulting segments

    Args:
        audio_file: path of the wav file; the utterance key is its base
            name (text after the last "/") with ".wav" removed.

    Returns:
        (segments, duration): the merged segments as returned by
        ``seg.merge_consecutive_segments`` and the summed segment length,
        computed as ``(segment[1] - segment[0]) / 100`` per segment
        (presumably seconds at 100 frames per second; confirm).
    """
    utt_key = audio_file.split("/")[-1].replace(".wav", "")
    # NOTE(review): audio_file is interpolated unquoted into a shell
    # pipeline; paths with spaces or shell metacharacters will break it, and
    # untrusted paths must not be passed in.
    feats_rspec = "ark:compute-mfcc-feats --config=sad/mfcc_hires.conf 'scp:echo " + utt_key + " " + audio_file + " |' ark:- |"

    # The context manager closes the reader (and its feature-extraction
    # pipeline) once the SAD output has been computed.
    with SequentialMatrixReader(feats_rspec) as feats:
        out = sad.segment(feats.value())

    segments, stats = seg.process(out["alignment"])
    segments = seg.merge_consecutive_segments(segments, stats)

    duration = 0
    for segment in segments:
        start_time = float(segment[0]) / 100
        end_time = float(segment[1]) / 100
        duration += (end_time - start_time)
    return segments, duration
예제 #7
0
def feat_to_post(feature_rspecifier, posterior_wspecifier, top_n=10):
    """Convert feature matrices into sparse top-N posteriors.

    For every utterance read from ``feature_rspecifier``, each row keeps its
    ``top_n`` largest entries as ``(column index, value)`` pairs (in the
    unspecified order produced by ``np.argpartition``) and the result is
    written to ``posterior_wspecifier`` under the same key.

    Asserts that every matrix has at least ``top_n`` columns.

    Returns:
        True once all utterances have been processed.
    """
    with SequentialMatrixReader(feature_rspecifier) as reader:
        with PosteriorWriter(posterior_wspecifier) as writer:
            for utt, matrix in reader:
                frames = matrix.numpy()
                assert top_n <= frames.shape[1]
                per_frame = [
                    [(int(col), float(frame[col]))
                     for col in np.argpartition(frame, -top_n)[-top_n:]]
                    for frame in frames
                ]
                writer[utt] = Posterior().from_posteriors(per_frame)
    return True
예제 #8
0
def post_to_count(feature_rspecifier, cnt_wspecifier, normalize=False, per_utt=False):
    """Write column-wise mean vectors of the matrices in a Kaldi table.

    Args:
        feature_rspecifier: rspecifier of the input matrix table.
        cnt_wspecifier: wspecifier of the output vector table.
        normalize: in the pooled mode, divide the accumulated sum of
            per-utterance means by the number of utterances.
        per_utt: when True, write one mean vector per utterance keyed by
            its utterance id; otherwise write a single pooled vector keyed
            by the number of utterances (as a string).

    Returns:
        True once the table has been processed.
    """
    with SequentialMatrixReader(feature_rspecifier) as reader:
        with VectorWriter(cnt_wspecifier) as writer:
            if not per_utt:
                # Pooled mode: accumulate per-utterance means.
                total = 0
                n_utts = 0
                for _utt, mat in reader:
                    total = total + mat.numpy().mean(axis=0)
                    n_utts = n_utts + 1
                if normalize:
                    total = total / n_utts
                writer[str(n_utts)] = Vector(total)
            else:
                for utt, mat in reader:
                    writer[utt] = Vector(mat.numpy().mean(axis=0))
    return True
예제 #9
0
 def segment(self):
     """
     Run speech segmentation.

     MFCC features are computed for the entries of ``self.scp`` using the
     configuration ``self.conf``. Each utterance is passed through
     ``self.sad`` and ``self.seg`` and its segments are written to the
     'segments' file below ``self.output``.

     Returns:
         segments: path to the segments description file
     """
     feats_rspec = ("ark:compute-mfcc-feats --verbose=0 --config="
                    + self.conf + " scp:" + self.scp + " ark:- |")
     segments = str(self.output / 'segments')
     with SequentialMatrixReader(feats_rspec) as reader:
         with open(segments, 'w') as sink:
             for utt_id, feats in reader:
                 sad_out = self.sad.segment(feats)
                 utt_segments, _stats = self.seg.process(sad_out['alignment'])
                 self.seg.write(utt_id, utt_segments, sink)
                 logging.info("Сегментирован файл '" + utt_id + "'")
     return segments
예제 #10
0
def feat_to_count(feature_rspecifier,
                  cnt_wspecifier,
                  normalize=False,
                  per_utt=False):
    """Write column-wise mean vectors of the matrices in a Kaldi table.

    With ``per_utt`` set, one mean vector is written per utterance under its
    own key. Otherwise the per-utterance means are summed (and divided by
    the utterance count when ``normalize`` is set) and a single vector is
    written, keyed by the number of utterances as a string.

    Returns:
        True once the table has been processed.
    """
    with SequentialMatrixReader(feature_rspecifier) as reader, \
            VectorWriter(cnt_wspecifier) as writer:
        if per_utt:
            for key, matrix in reader:
                writer[key] = Vector(matrix.numpy().mean(axis=0))
            return True

        running_sum, utt_count = 0, 0
        for _key, matrix in reader:
            running_sum = running_sum + matrix.numpy().mean(axis=0)
            utt_count = utt_count + 1
        if normalize:
            running_sum = running_sum / utt_count
        writer[str(utt_count)] = Vector(running_sum)
    return True
예제 #11
0
def feat_to_post(feature_rspecifier, posterior_wspecifier, top_n=10, rescale=False):
    """Convert feature matrices into sparse top-N posteriors.

    For every utterance read from ``feature_rspecifier``, each row keeps its
    ``top_n`` largest entries as ``(column index, value)`` pairs (in the
    unspecified order produced by ``np.argpartition``) and the result is
    written to ``posterior_wspecifier`` under the same key.

    Args:
        feature_rspecifier: rspecifier of the input matrix table.
        posterior_wspecifier: wspecifier of the output posterior table.
        top_n: number of entries kept per row; must be >= 1 and no larger
            than the number of columns of any matrix (both asserted).
        rescale: when True, the kept values of each row are divided by
            their sum so they add up to one; if that sum is zero every kept
            entry gets the uniform value ``1 / top_n``.

    Returns:
        True once all utterances have been processed.
    """
    assert top_n >= 1
    with SequentialMatrixReader(feature_rspecifier) as feature_reader, \
            PosteriorWriter(posterior_wspecifier) as posterior_writer:
        for uttid, feat in feature_reader:
            feat_np = feat.numpy()
            assert top_n <= feat_np.shape[1]
            posts_lst = []
            for row in feat_np:
                idxs = np.argpartition(row, -top_n)[-top_n:]
                # values[k] belongs to column idxs[k]; keep the two aligned
                # by position rather than indexing values by column id.
                values = [float(row[idx]) for idx in idxs]
                if rescale:
                    sum_post = sum(values)
                    if 0 == sum_post:
                        values = [1. / len(idxs)] * len(idxs)
                    else:
                        values = [value / sum_post for value in values]
                posts_lst.append(
                    [(int(idx), value) for idx, value in zip(idxs, values)])
            posterior_writer[uttid] = Posterior().from_posteriors(posts_lst)
    return True
예제 #12
0
            mat.apply_exp_()

        if opts.apply_softmax_per_row:
            apply_softmax_per_row(mat)

        if opts.apply_power != 1.0:
            mat.apply_power_(opts.apply_power)

        with Output(matrix_out_fn, opts.binary) as ko:
            mat.write(ko.stream(), opts.binary)

        logging.info("Copied matrix to {}".format(matrix_out_fn))

    else:
        with MatrixWriter(matrix_out_fn) as writer, \
             SequentialMatrixReader(matrix_in_fn) as reader:
            for num_done, (key, mat) in enumerate(reader):

                if opts.scale != 1.0 or\
                   opts.apply_log or\
                   opts.apply_exp or\
                   opts.apply_power != 1.0 or\
                   opts.apply_softmax_per_row:

                    if opts.scale != 1.0:
                        mat.scale_(opts.scale)

                    if opts.apply_log:
                        mat.apply_floor_(1.0e-20)
                        mat.apply_log_()
예제 #13
0
# Build the GMM recognizer: decoder options first, then model, graph and
# word symbol table.
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.max_active = 7000
decoder_opts.beam = 11.0
asr = GmmLatticeFasterRecognizer.from_files(
    "final.mdl", "HCLG.fst", "words.txt", decoder_opts=decoder_opts)

# Feature extraction is delegated to a Kaldi command pipeline:
# MFCCs -> sliding-window CMVN -> delta features.
feats_rspecifier = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- | "
    "apply-cmvn-sliding --cmn-window=10000 --center=true ark:- ark:- | "
    "add-deltas ark:- ark:- |")

# Decode each utterance and print "<key> <text>"
for key, feats in SequentialMatrixReader(feats_rspecifier):
    out = asr.decode(feats)
    print(key, out["text"], flush=True)

# Separator line between the two parts of the example
print("-" * 80, flush=True)


# Define feature pipeline in code
def make_feat_pipeline(base, opts=DeltaFeaturesOptions()):
    def feat_pipeline(wav):
        feats = base.compute_features(wav.data()[0], wav.samp_freq, 1.0)
        cmvn = Cmvn(base.dim())
        cmvn.accumulate(feats)
        cmvn.apply(feats)
        return compute_deltas(opts, feats)
예제 #14
0
def utt_generator(align_rspec, feats_rspec, shuffle, args):
    """
    Yield fixed-size, padded batches of (features, alignment) pairs.

    Alignments and features are read in lockstep from two Kaldi tables and
    accumulated into pre-allocated buffers. Whenever ``args.batch_size``
    utterances have been collected, the batch is padded, copied, converted
    to torch tensors and yielded. A trailing partial batch is never yielded;
    once the readers are exhausted the generator yields a single ``None``.

    Args:
        align_rspec: kaldi style read rspecifier for alignment
        feats_rspec: kaldi style read rspecifier for feature
        shuffle: deprecated
        args: arguments; this function reads ``max_len``, ``batch_size``,
            ``cuda``, ``local_rank``, ``stride``, ``lctx``, ``rctx``,
            ``padding_tgt`` and ``batch_first`` (``get_inputdim`` also
            receives ``args``)

    Yields:
        ``(data, target, lens, ali_lens)`` for every full batch, where
        ``data`` and ``target`` are wrapped in ``Variable`` and ``lens`` /
        ``ali_lens`` are numpy copies of the per-utterance feature and
        alignment lengths; finally ``None``.
    """
    # NOTE(review): the two readers are never closed explicitly.
    ali_reader = SequentialIntVectorReader(align_rspec)
    feats_reader = SequentialMatrixReader(feats_rspec)
    max_len = args.max_len
    batch_size = args.batch_size
    # Buffers are allocated once and reused for every batch.
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    # NOTE(review): start_flag is built (and moved to the GPU) but is not
    # part of what this generator yields.
    start_flag = torch.IntTensor([1] * batch_size)

    if args.cuda:
        start_flag = start_flag.cuda(args.local_rank)

    # Per-batch state: fill position and the longest lengths seen so far.
    batch_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1
    for (uttid, ali), (uttid2, feats) in zip(ali_reader, feats_reader):
        # Both tables must list the utterances in the same order.
        assert uttid2 == uttid
        ali = np.array(ali)
        feats = _matrix_ext.matrix_to_numpy(feats)
        # Number of frames after striding: ceil(num_frames / stride).
        utt_len = feats.shape[0] // args.stride + int(
            feats.shape[0] % args.stride != 0)
        #ali/targets should be shorter
        #assert ali.shape[0] <= utt_len
        ali_len[batch_idx] = ali.shape[0]
        # Splice context frames, then keep every ``stride``-th frame.
        data_buffer[batch_idx, :utt_len, :] = \
            splice(feats, args.lctx, args.rctx)[::args.stride]
        target_buffer[batch_idx, :ali_len[batch_idx]] = ali
        #target_len += ali_len[batch_idx]
        len_buffer[batch_idx] = utt_len

        if utt_len > batch_max_len:
            batch_max_len = utt_len

        if ali_len[batch_idx] > target_max_len:
            target_max_len = ali_len[batch_idx]

        batch_idx += 1

        # A full batch has been collected: pad, slice, copy and emit it.
        if batch_idx == batch_size:
            for b in range(batch_size):
                utt_len = len_buffer[b]
                target_len = ali_len[b]
                #data and target padding
                # Features are padded by repeating the last valid frame,
                # targets with args.padding_tgt.
                data_buffer[b, utt_len:batch_max_len, :] = \
                    data_buffer[b, utt_len-1, :]
                target_buffer[b, target_len:target_max_len] = args.padding_tgt

            # Trim the buffers to the longest item in this batch.
            data = data_buffer[:, :batch_max_len, :]
            target = target_buffer[:, :target_max_len]

            # Default layout is batch-major; switch to time-major on request.
            if not args.batch_first:
                data = np.transpose(data, (1, 0, 2))
                target = np.transpose(target, (1, 0))

            # Copy so the yielded arrays do not alias the reused buffers.
            data = np.copy(data)
            target = np.copy(target)
            lens = np.copy(len_buffer)
            ali_lens = np.copy(ali_len)

            data = torch.from_numpy(data)
            target = torch.from_numpy(target).long()

            if args.cuda:
                data = data.cuda(args.local_rank)
                target = target.cuda(args.local_rank)
            yield Variable(data), Variable(target), lens, ali_lens

            # Reset the per-batch state for the next batch.
            batch_idx = 0
            target_len = 0
            batch_max_len = -1
            target_max_len = -1

    # Signals exhaustion to the consumer; utterances left in a partial
    # batch are dropped.
    yield None
예제 #15
0
def gmm_decode_faster(model_rxfilename, fst_rxfilename,
                      feature_rspecifier, words_wspecifier,
                      alignment_wspecifier="", lattice_wspecifier="",
                      word_symbol_table="", acoustic_scale=0.1,
                      allow_partial=True, decoder_opts=FasterDecoderOptions()):
    """Decode features with a GMM acoustic model using ``FasterDecoder``.

    Args:
        model_rxfilename: rxfilename holding the transition model followed
            by the diagonal-GMM acoustic model.
        fst_rxfilename: rxfilename of the decoding graph.
        feature_rspecifier: rspecifier of the input feature table.
        words_wspecifier: wspecifier for the decoded word-id sequences.
        alignment_wspecifier: wspecifier for alignments; written only when
            the writer reports being open.
        lattice_wspecifier: wspecifier for the best path as a compact
            lattice; written only when the writer reports being open.
        word_symbol_table: optional symbol table file; when given, decoded
            words are also printed to stderr as symbols.
        acoustic_scale: scale applied to acoustic log-likelihoods.
        allow_partial: accept utterances whose decoding did not reach a
            final state.
        decoder_opts: options passed to ``FasterDecoder``.
            NOTE(review): the default instance is created once at
            definition time and shared between calls.

    Returns:
        True if at least one utterance was decoded successfully.

    Raises:
        RuntimeError: if ``word_symbol_table`` is given but cannot be read.
    """
    # Read model.
    trans_model = TransitionModel()
    am_gmm = AmDiagGmm()
    with xopen(model_rxfilename) as ki:
        trans_model.read(ki.stream(), ki.binary)
        am_gmm.read(ki.stream(), ki.binary)

    # Open table readers/writers.
    feature_reader = SequentialMatrixReader(feature_rspecifier)
    words_writer = IntVectorWriter(words_wspecifier)
    alignment_writer = IntVectorWriter(alignment_wspecifier)
    clat_writer = CompactLatticeWriter(lattice_wspecifier)

    tot_like = 0.0
    frame_count = 0
    num_success, num_fail = 0, 0

    # try/finally guarantees the tables are closed even when decoding or
    # symbol-table loading raises.
    try:
        # Read symbol table.
        word_syms = None
        if word_symbol_table != "":
            word_syms = SymbolTable.read_text(word_symbol_table)
            if not word_syms:
                raise RuntimeError("Could not read symbol table from file {}"
                                   .format(word_symbol_table))

        # NOTE:
        # It is important to read decode_fst after opening feature reader as
        # it can prevent crashes on systems without enough virtual memory.

        # Read decoding graph and instantiate decoder.
        decode_fst = read_fst_kaldi(fst_rxfilename)
        decoder = FasterDecoder(decode_fst, decoder_opts)

        start = time.time()

        for key, features in feature_reader:
            if features.num_rows == 0:
                num_fail += 1
                logging.warning("Zero-length utterance: {}".format(key))
                continue

            gmm_decodable = DecodableAmDiagGmmScaled(am_gmm, trans_model,
                                                     features, acoustic_scale)
            decoder.decode(gmm_decodable)

            if not (allow_partial or decoder.reached_final()):
                num_fail += 1
                logging.warning("Did not successfully decode utterance {}, len = {}"
                                .format(key, features.num_rows))
                continue

            try:
                best_path = decoder.get_best_path()
            except RuntimeError:
                num_fail += 1
                logging.warning("Did not successfully decode utterance {}, len = {}"
                                .format(key, features.num_rows))
                continue

            if not decoder.reached_final():
                logging.warning("Decoder did not reach end-state, outputting "
                                "partial traceback since --allow-partial=true")

            ali, words, weight = get_linear_symbol_sequence(best_path)

            words_writer[key] = words

            if alignment_writer.is_open():
                alignment_writer[key] = ali

            if clat_writer.is_open():
                # Undo the acoustic scaling before writing the lattice.
                if acoustic_scale != 0.0:
                    scale = acoustic_lattice_scale(1.0 / acoustic_scale)
                    scale_lattice(scale, best_path)
                best_path = convert_lattice_to_compact_lattice(best_path)
                clat_writer[key] = best_path

            if word_syms:
                syms = convert_indices_to_symbols(word_syms, words)
                print(key, " ".join(syms), file=sys.stderr)

            num_success += 1
            frame_count += features.num_rows
            like = - (weight.value1 + weight.value2)
            tot_like += like
            logging.info("Log-like per frame for utterance {} is {} over {} "
                         "frames.".format(key, like / features.num_rows,
                                          features.num_rows))
            logging.debug("Cost for utterance {} is {} + {}"
                          .format(key, weight.value1, weight.value2))

        elapsed = time.time() - start
        # Per-frame statistics are undefined when nothing was decoded;
        # skip them instead of dividing by zero.
        if frame_count > 0:
            logging.info("Time taken [excluding initialization] {}s: real-time factor "
                         "assuming 100 frames/sec is {}"
                         .format(elapsed, elapsed * 100 / frame_count))
        else:
            logging.info("Time taken [excluding initialization] {}s"
                         .format(elapsed))
        logging.info("Done {} utterances, failed for {}"
                     .format(num_success, num_fail))
        if frame_count > 0:
            logging.info("Overall log-likelihood per frame is {} over {} frames."
                         .format(tot_like / frame_count, frame_count))
    finally:
        feature_reader.close()
        words_writer.close()
        if alignment_writer.is_open():
            alignment_writer.close()
        if clat_writer.is_open():
            clat_writer.close()

    return num_success != 0
예제 #16
0
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    """Transcribe a media file and return word-level timing information.

    The input is converted with ffmpeg to a 16 kHz mono wav below ``tmp/``,
    decoded with an nnet3 recognizer configured from
    ``models/kaldi_tuda_de_nnet3_chain2.yaml``, and the best path of the
    lattice is turned into a word alignment. The temporary scp, wav and
    spk2utt files are removed on success.

    Args:
        filenameS_hash: identifier used as utterance/speaker id and as the
            stem of the temporary files in ``tmp/``.
        filenameS: path of the input media file handed to ffmpeg.
        asr_beamsize: decoder beam.
        asr_max_active: decoder ``max_active`` setting.

    Returns:
        (vtt, words): ``vtt`` is a list of ``[word, timing[1], timing[2]]``
        lists built from the word alignment of the last decoded utterance,
        and ``words`` the corresponding list of word symbols.
        NOTE(review): the original comment calls the two numbers begin and
        end in frames; the third element may be a duration - verify against
        ``compact_lattice_to_word_alignment``.

    Raises:
        AssertionError: if no utterance was decoded or the feature and
            i-vector keys differ.
    """
    models_dir = "models/"

    # Read yaml File
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    # write scp file
    with open(scp_filename, 'w') as scp_file:
        scp_file.write("%s %s\n" % (filenameS_hash, wav_filename))

    # write spk2utt file (a single speaker with a single utterance)
    with open(spk2utt_filename, 'w') as spk2utt_file:
        spk2utt_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

    # use ffmpeg to convert the input media file (any format!) to 16 kHz wav mono
    # (the input is the ``filenameS`` parameter; the previous code referenced
    # an undefined name ``filename`` here)
    (
        ffmpeg
            .input(filenameS)
            .output(wav_filename, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
    )

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = asr_beamsize
    decoder_opts.max_active = asr_max_active
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    # Named ``recognizer`` so the local does not shadow this function.
    recognizer = NnetLatticeFasterRecognizer.from_files(
        models_dir + decoder_yaml_opts["model"],
        models_dir + decoder_yaml_opts["fst"],
        models_dir + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)

    # Construct symbol table
    symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])
    # NOTE(review): phi_label is looked up but not used below.
    phi_label = symbols.find_index("#0")

    # Define feature pipelines as Kaldi rspecifiers
    feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:- |") % \
                  (models_dir + decoder_yaml_opts["mfcc-config"])
    ivectors_rspec = (
            ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:-"
             + " | ivector-extract-online2 --config=%s ark:" + spk2utt_filename + " ark:- ark:- |") %
            ((models_dir + decoder_yaml_opts["mfcc-config"]),
             (models_dir + decoder_yaml_opts["ivector-extraction-config"]))
    )

    did_decode = False
    # Decode wav files
    with SequentialMatrixReader(feats_rspec) as f, \
            SequentialMatrixReader(ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            did_decode = True
            assert (fkey == ikey)
            out = recognizer.decode((feats, ivectors))
            best_path = functions.compact_lattice_shortest_path(out["lattice"])
            # NOTE(review): this word sequence is overwritten after the
            # loop; the call is kept in case it is relied on for validation.
            words, _, _ = get_linear_symbol_sequence(shortestpath(best_path))
            timing = functions.compact_lattice_to_word_alignment(best_path)

    assert(did_decode)

    # Maps words to the numbers
    words = indices_to_symbols(symbols, timing[0])

    # Creates the datastructure (Word, begin(Frames), end(Frames))
    vtt = list(map(list, zip(words, timing[1], timing[2])))

    # Cleanup tmp files
    print('removing tmp file:', scp_filename)
    os.remove(scp_filename)
    print('removing tmp file:', wav_filename)
    os.remove(wav_filename)
    print('removing tmp file:', spk2utt_filename)
    os.remove(spk2utt_filename)
    return vtt, words
예제 #17
0
def ctc_utt_generator(align_rspec, feats_rspec, shuffle, args):
    """
    Yield fixed-size batches for CTC training.

    we do not really need 'target' generated
    in MMI/sMBR training from this generator
    so the interface is adjusted to fulfil
    warp_ctc for CTC training, target is now
    a tuple of (label, label_size).

    Alignments and features are read in lockstep from two Kaldi tables.
    The labels of a batch are concatenated into one flat vector, while the
    features are stored per utterance and zero-padded. A trailing partial
    batch is never yielded; once the readers are exhausted the generator
    yields a single ``None``.

    Args:
        align_rspec: kaldi style read rspecifier for alignment
        feats_rspec: kaldi style read rspecifier for feature
        shuffle: not used by this function
        args: arguments; this function reads ``max_len``, ``batch_size``,
            ``cuda``, ``local_rank``, ``stride``, ``lctx``, ``rctx`` and
            ``batch_first`` (``get_inputdim`` also receives ``args``)

    Yields:
        ``(data, (target, ali_lens), lens, start_flag)`` for every full
        batch, where ``data`` and ``target`` are wrapped in ``Variable``;
        finally ``None``.
    """
    # NOTE(review): the two readers are never closed explicitly.
    ali_reader = SequentialIntVectorReader(align_rspec)
    feats_reader = SequentialMatrixReader(feats_rspec)
    max_len = args.max_len
    batch_size = args.batch_size

    # Buffers are allocated once and reused for every batch; the target
    # buffer is flat because labels are concatenated across utterances.
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size * max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    start_flag = torch.IntTensor([1] * batch_size)

    if args.cuda:
        start_flag = start_flag.cuda(args.local_rank)

    # Per-batch state: fill position, number of labels written so far and
    # the longest feature length seen.
    batch_idx = 0
    target_len = 0
    batch_max_len = -1

    #!!!make sure feature and ali
    #!!!has exact the same  order
    for (uttid, ali), (uttid2, feats) in zip(ali_reader, feats_reader):
        assert uttid2 == uttid
        ali = np.array(ali)
        feats = _matrix_ext.matrix_to_numpy(feats)
        #in CTC training, the ali is shorter
        # Number of frames after striding: ceil(num_frames / stride).
        utt_len = feats.shape[0] // args.stride + \
                  int(feats.shape[0] % args.stride != 0)
        assert ali.shape[0] <= utt_len

        ali_len[batch_idx] = ali.shape[0]
        # Splice context frames, then keep every ``stride``-th frame.
        data_buffer[batch_idx, :utt_len, :] = splice(feats, args.lctx,
                                                     args.rctx)[::args.stride]
        # Append this utterance's labels to the flat target vector.
        target_buffer[target_len:target_len + ali_len[batch_idx]] = ali
        target_len += ali_len[batch_idx]
        len_buffer[batch_idx] = utt_len

        if utt_len > batch_max_len:
            batch_max_len = utt_len

        batch_idx += 1

        # A full batch has been collected: pad, slice, copy and emit it.
        if batch_idx == batch_size:
            for b in range(batch_size):
                utt_len = len_buffer[b]
                # Zero out stale frames left over from earlier batches.
                data_buffer[b, utt_len:batch_max_len, :] = 0
                #target_buffer[b, ali_len[b]:batch_max_len]  = -1

            data = data_buffer[:, :batch_max_len, :]
            target = target_buffer[:target_len]

            # Default layout is batch-major; switch to time-major on request.
            if not args.batch_first:
                data = np.transpose(data, (1, 0, 2))
                #target = np.transpose(target, (1, 0))

            # Copy so the yielded arrays do not alias the reused buffers.
            data = np.copy(data)
            target = np.copy(target)
            lens = np.copy(len_buffer)
            ali_lens = np.copy(ali_len)

            data = torch.from_numpy(data)
            target = torch.from_numpy(target)

            # Only the features are moved to the GPU; the labels are left
            # where they are.
            if args.cuda:
                data, target = data.cuda(args.local_rank), target

            yield Variable(data), (Variable(target),
                                   ali_lens), lens, start_flag

            # Reset the per-batch state for the next batch.
            batch_idx = 0
            target_len = 0
            batch_max_len = -1

    # Signals exhaustion to the consumer; utterances left in a partial
    # batch are dropped.
    yield None
예제 #18
0
# Build the nnet3 aligner together with the phone table and the
# word-boundary information needed for phone/word level alignments.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.frame_subsampling_factor = 3
decodable_opts.acoustic_scale = 1.0
aligner = NnetAligner.from_files(
    "final.mdl",
    "tree",
    "L.fst",
    "words.txt",
    "disambig.int",
    decodable_opts=decodable_opts,
)
phones = SymbolTable.read_text("phones.txt")
wb_info = WordBoundaryInfo.from_file(
    WordBoundaryInfoNewOpts(), "word_boundary.int")

# Kaldi rspecifiers: plain MFCCs, and MFCCs piped through the online
# i-vector extractor.
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- | "
    "ivector-extract-online2 --config=ivector.conf ark:spk2utt ark:- ark:- |"
)

# Walk features, i-vectors and transcripts in lockstep and align each one.
with SequentialMatrixReader(feats_rspec) as f:
    with SequentialMatrixReader(ivectors_rspec) as i:
        with open("text") as t:
            for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
                # Each transcript line is "<key> <text>".
                tkey, text = line.strip().split(maxsplit=1)
                assert fkey == ikey == tkey
                out = aligner.align((feats, ivectors), text)
                print(fkey, out["alignment"], flush=True)
                phone_alignment = aligner.to_phone_alignment(
                    out["alignment"], phones)
                print(fkey, phone_alignment, flush=True)
                word_alignment = aligner.to_word_alignment(
                    out["best_path"], wb_info)
                print(fkey, word_alignment, flush=True)
예제 #19
0
                                      nr - prev_num_frames_computed).numpy()
                llhs.append(x)
                prev_num_frames_computed = nr
            asr.advance_decoding()
            num_frames_decoded = asr.decoder.num_frames_decoded()
            if not last_chunk:
                if num_frames_decoded > prev_num_frames_decoded:
                    prev_num_frames_decoded = num_frames_decoded
                    out = asr.get_partial_output()
                    print(key + "-part%d" % part, out["text"], flush=True)
                    part += 1
        asr.finalize_decoding()
        out = asr.get_output()
        print(key + "-final", out["text"], flush=True)

        llout[key] = numpy.concatenate(llhs, axis=0)

# Second pass: the same utterances, possibly with a different HCLG.fst.

# This recognizer consumes log-likelihood matrices directly instead of
# computing them from features.
asr = MappedLatticeFasterRecognizer.from_files(
    "final.mdl",
    "HCLG.fst",
    "words.txt",
    acoustic_scale=1.0,
    decoder_opts=decoder_opts,
)

# Decode every stored log-likelihood matrix and print "<key>-fromllhs <text>"
with SequentialMatrixReader("ark:loglikes.ark") as llin:
    for key, loglikes in llin:
        out = asr.decode(loglikes)
        print(key + '-fromllhs', out["text"], flush=True)
예제 #20
0
# Build the GMM aligner; the model is read through a gmm-boost-silence
# command pipeline instead of directly from a file.
aligner = GmmAligner.from_files(
    "gmm-boost-silence --boost=1.0 1 final.mdl - |",
    "tree", "L.fst", "words.txt", "disambig.int",
    self_loop_scale=0.1)
phones = SymbolTable.read_text("phones.txt")
wb_info = WordBoundaryInfo.from_file(
    WordBoundaryInfoNewOpts(), "word_boundary.int")

# Feature extraction as a Kaldi command pipeline:
# MFCCs -> sliding-window CMVN -> delta features.
feats_rspecifier = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- | "
    "apply-cmvn-sliding --cmn-window=10000 --center=true ark:- ark:- | "
    "add-deltas ark:- ark:- |")

# Walk features and transcripts in lockstep and align each utterance.
with SequentialMatrixReader(feats_rspecifier) as f:
    with open("text") as t:
        for (fkey, feats), line in zip(f, t):
            # Each transcript line is "<key> <text>".
            tkey, text = line.strip().split(maxsplit=1)
            assert fkey == tkey
            out = aligner.align(feats, text)
            print(fkey, out["alignment"], flush=True)
            phone_alignment = aligner.to_phone_alignment(
                out["alignment"], phones)
            print(fkey, phone_alignment, flush=True)
            word_alignment = aligner.to_word_alignment(
                out["best_path"], wb_info)
            print(fkey, word_alignment, flush=True)
예제 #21
0
        for wav_path in wav_files:
            _, wav_filename = path.split(wav_path)
            wav_filename = wav_filename[:-4] #remove extension
            fout.write("{} {}\n".format(wav_filename, wav_path))

# Write the scp file that lists the input wav files
create_scp(args["input"])

# Decoder search options
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.max_active = 7000
decoder_opts.lattice_beam = 6.0
decoder_opts.beam = 13

# Locations taken from the command-line arguments; the model lives in
# <modelroot>/exp/<type>.
KALDI_DIR = args["kaldiroot"]
TYPE = args["type"]
MODEL_ROOT = args["modelroot"]
MODEL_DIR = path.join(MODEL_ROOT, "exp", TYPE)

# Build the GMM recognizer from the model, graph and word table
asr = GmmLatticeFasterRecognizer.from_files(
    path.join(MODEL_DIR, "final.mdl"),
    path.join(MODEL_DIR, "graph", "HCLG.fst"),
    path.join(MODEL_DIR, "graph", "words.txt"),
    decoder_opts=decoder_opts,
)

# Decode every utterance produced by the feature pipeline for this model type
with SequentialMatrixReader(feat_pipeline(TYPE)) as f:
    for key, feats in f:
        out = asr.decode(feats)
        print(f"Audio file: {key}\nTrancription:", out["text"])
0
파일: decode.py 프로젝트: hlthu/pykaldi
# Decoder search options
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.max_active = 7000
decoder_opts.beam = 13

# Options for the neural-network acoustic model computation
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.frames_per_chunk = 150
decodable_opts.frame_subsampling_factor = 3
decodable_opts.acoustic_scale = 1.0

# Build the nnet3 recognizer from the chain model, graph and word table
asr = NnetLatticeFasterRecognizer.from_files(
    "exp/tdnn_7b_chain_online/final.mdl",
    "exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
    "exp/tdnn_7b_chain_online/graph_pp/words.txt",
    decoder_opts=decoder_opts, decodable_opts=decodable_opts)

# Kaldi rspecifiers: the i-vector pipeline is the MFCC pipeline piped into
# the online i-vector extractor.
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |")
ivectors_rspec = (
    feats_rspec
    + "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)

# Decode all utterances and write "<key> <text>" lines to out/decode.out
with SequentialMatrixReader(feats_rspec) as f:
    with SequentialMatrixReader(ivectors_rspec) as i:
        with open("out/decode.out", "w") as o:
            for (key, feats), (_, ivectors) in zip(f, i):
                out = asr.decode((feats, ivectors))
                print(key, out["text"], file=o)
예제 #23
0
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Load the pieces of the speech activity detector: the raw network, the
# transform derived from the average posteriors, and the decoding graph.
model = NnetSAD.read_model("final.raw")
post = NnetSAD.read_average_posteriors("post_output.vec")
transform = NnetSAD.make_sad_transform(post)
graph = NnetSAD.make_sad_graph()

# Options for the neural-network computation
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 0.3
decodable_opts.frames_per_chunk = 150
decodable_opts.extra_left_context = 79
decodable_opts.extra_right_context = 21
decodable_opts.extra_left_context_initial = 0
decodable_opts.extra_right_context_final = 0

sad = NnetSAD(model, transform, graph, decodable_opts=decodable_opts)
seg = SegmentationProcessor(target_labels=[2])

# MFCC extraction as a Kaldi command pipeline
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"

# Segment every utterance and write the result to the "segments" file
with SequentialMatrixReader(feats_rspec) as f:
    with open("segments", "w") as s:
        for key, feats in f:
            out = sad.segment(feats)
            segments, stats = seg.process(out["alignment"])
            seg.write(key, segments, s)
            print("segments:", segments, flush=True)
            print("stats:", stats, flush=True)
# Statistics accumulated over all utterances
print("global stats:", seg.stats, flush=True)