예제 #1
0
    def forward(ctx, loglikes, den_graph, supervision, chain_opts):
        """Evaluate the Kaldi chain objective and stash its derivative.

        The network output is detached, copied to the host and wrapped as a
        Kaldi matrix before being handed to
        ``kaldi_chain.compute_chain_objf_and_deriv``. The derivative, with
        the cross-entropy derivative added in (scaled by
        ``chain_opts.xent_regularize``), is saved on ``ctx``.

        Args:
            ctx: Context object; receives the derivative through
                ``save_for_backward``.
            loglikes: Network output tensor.
            den_graph: Denominator graph, forwarded to Kaldi unchanged.
            supervision: Supervision object, forwarded to Kaldi unchanged.
            chain_opts: Chain options; ``xent_regularize`` is read here.

        Returns:
            A tensor built from the first element of the Kaldi result.
        """
        # NOTE: explicit CuDevice selection was disabled (commented out) in
        # the original code and is not performed here either.
        host_out = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
        nnet_out = kaldi_cudamatrix.CuMatrix().from_matrix(host_out)
        n_rows = nnet_out.num_rows()
        n_cols = nnet_out.num_cols()

        # Output buffers that Kaldi fills in place.
        grad = kaldi_cudamatrix.CuMatrix().from_size(n_rows, n_cols)
        grad_xent = kaldi_cudamatrix.CuMatrix().from_size(n_rows, n_cols)

        objf = kaldi_chain.compute_chain_objf_and_deriv(
            chain_opts, den_graph, supervision, nnet_out, grad, grad_xent)

        # grad += xent_regularize * grad_xent
        grad.add_mat(chain_opts.xent_regularize, grad_xent)

        host_grad = kaldi_matrix.Matrix(n_rows, n_cols)
        grad.copy_to_mat(host_grad)

        saved_grad = th.from_numpy(host_grad.numpy()).cuda()
        ctx.save_for_backward(saved_grad)

        return th.tensor(objf[0])
예제 #2
0
    def forward(ctx, loglikes_T, loglikes_S, asr_decoder):
        """Teacher/student loss computed from posteriors on a shared lattice.

        A lattice is decoded from the teacher output, forward-backward is run
        on it, the lattice is rescored with the student output and
        forward-backward is run again. The difference between the two
        pdf-level posterior matrices is saved on ``ctx``.

        Args:
            ctx: Context object; receives the posterior difference through
                ``save_for_backward``.
            loglikes_T: Teacher network output tensor.
            loglikes_S: Student network output.
            asr_decoder: Decoder whose ``decode`` result provides a
                ``"lattice"`` entry.

        Returns:
            The cross-entropy value as a Python float (``loss.item()``).

        Note:
            ``trans_model`` is not a parameter; it must be resolvable from
            the enclosing module scope.
        """
        # We can use either the teacher model or the student model to
        # generate the lattice; the teacher output is used here.
        decode_out = asr_decoder.decode(
            kaldi_matrix.Matrix(loglikes_T.detach().cpu().numpy()))
        # Fixed: the original read from an undefined name `decoder_out`.
        lattice = decode_out["lattice"]
        kaldi_lat.functions.top_sort_lattice_if_needed(lattice)
        _, post_T, _ = kaldi_lat.functions.lattice_forward_backward(lattice)

        # NOTE(review): loglikes_S is passed through unconverted, unlike the
        # teacher output above which is wrapped in a Kaldi Matrix -- confirm
        # that DecodableMatrixScaled accepts it in this form.
        decodable = kaldi_decoder.DecodableMatrixScaled(loglikes_S, 1.0)
        if not kaldi_lat.functions.rescore_lattice(decodable, lattice):
            sys.stderr.write('ERROR: Rescore lattice failed!\n')
            # Non-zero status so the failure is visible to the caller.
            sys.exit(1)
        _, post_S, _ = kaldi_lat.functions.lattice_forward_backward(lattice)

        post_T = kaldi_hmm.Posterior.from_posteriors(post_T)
        post_S = kaldi_hmm.Posterior.from_posteriors(post_S)
        pdf_post_T = post_T.to_pdf_matrix(trans_model).numpy()
        pdf_post_S = post_S.to_pdf_matrix(trans_model).numpy()
        post_mat = pdf_post_T - pdf_post_S

        ctx.save_for_backward(th.from_numpy(post_mat).cuda())

        # Fixed: the original called th.from_numpy on the Posterior objects
        # (and misspelled it as `from_nompy`); the numpy pdf matrices are
        # what can be converted to tensors.
        # NOTE(review): argument order (teacher as input, student as target)
        # is kept from the original -- confirm it is the intended direction.
        loss = F.cross_entropy(th.from_numpy(pdf_post_T),
                               th.from_numpy(pdf_post_S))

        return loss.item()
예제 #3
0
    def forward(ctx, loglikes, den_graph, supervision, chain_opts):
        """Run the Kaldi chain objective and keep its derivative on ``ctx``.

        Args:
            ctx: Context object; the combined derivative is stored on it via
                ``save_for_backward``.
            loglikes: Network output tensor; detached and copied to the host
                before being wrapped as a Kaldi matrix.
            den_graph: Denominator graph, passed to Kaldi as-is.
            supervision: Supervision object, passed to Kaldi as-is.
            chain_opts: Chain options; ``xent_regularize`` scales the
                cross-entropy derivative that is added to the main one.

        Returns:
            A tensor holding the first element of the value returned by
            ``kaldi_chain.compute_chain_objf_and_deriv``.
        """
        cpu_matrix = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
        nnet_out = kaldi_cudamatrix.CuMatrix().from_matrix(cpu_matrix)
        rows, cols = nnet_out.num_rows(), nnet_out.num_cols()

        # Derivative buffers, filled in place by the Kaldi call below.
        deriv = kaldi_cudamatrix.CuMatrix().from_size(rows, cols)
        xent_deriv = kaldi_cudamatrix.CuMatrix().from_size(rows, cols)

        result = kaldi_chain.compute_chain_objf_and_deriv(
            chain_opts, den_graph, supervision, nnet_out, deriv, xent_deriv)

        # deriv += xent_regularize * xent_deriv
        deriv.add_mat(chain_opts.xent_regularize, xent_deriv)

        cpu_deriv = kaldi_matrix.Matrix(rows, cols)
        deriv.copy_to_mat(cpu_deriv)

        ctx.save_for_backward(th.from_numpy(cpu_deriv.numpy()).cuda())

        return th.tensor(result[0])
예제 #4
0
파일: ops.py 프로젝트: xdcesc/pykaldi2
    def forward(ctx, loglikes, asr_decoder, trans_model, trans_ids):
        """Decode a lattice and run the MMI lattice forward-backward on it.

        Args:
            ctx: Context object; the pdf-level posterior matrix is stored on
                it via ``save_for_backward``.
            loglikes: Network output tensor; detached and copied to the host
                before decoding.
            asr_decoder: Decoder whose ``decode`` result provides a
                ``"lattice"`` entry.
            trans_model: Transition model used for the forward-backward pass
                and for the conversion to a pdf matrix.
            trans_ids: Reference alignment passed to the forward-backward
                routine.

        Returns:
            A tensor holding the first value returned by
            ``lattice_forward_backward_mmi``.
        """
        host_loglikes = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
        lattice = asr_decoder.decode(host_loglikes)["lattice"]
        kaldi_lat.functions.top_sort_lattice_if_needed(lattice)

        # The three flags are presumably (drop_frames, convert_to_pdf_ids,
        # cancel), following Kaldi's LatticeForwardBackwardMmi -- TODO confirm.
        lat_like, arc_post = kaldi_lat.functions.lattice_forward_backward_mmi(
            trans_model, lattice, trans_ids, True, False, True)

        posterior = kaldi_hmm.Posterior.from_posteriors(arc_post)
        pdf_post = posterior.to_pdf_matrix(trans_model).numpy()

        ctx.save_for_backward(th.from_numpy(pdf_post).cuda())
        return th.tensor(lat_like)
예제 #5
0
파일: table.py 프로젝트: taomanwai/pykaldi
    def write(self, key, value):
        """Store ``value`` under ``key`` in the table.

        Exists for parity with the C++ API; the Pythonic API is the
        recommended interface for most callers.

        The value is wrapped in ``_matrix.Matrix`` before it is forwarded to
        the parent writer, so both Matrix and SubMatrix inputs are accepted.

        Args:
            key (str): The key.
            value: The value.
        """
        as_matrix = _matrix.Matrix(value)
        super(MatrixWriter, self).write(key, as_matrix)
예제 #6
0
    def forward(ctx, loglikes, asr_decoder, trans_model, supervision, config):
        """Expected edit distance over a set of distinct lattice paths.

        A lattice is decoded from ``loglikes`` and scaled, then a set of
        distinct paths is collected from it (random equal-align samples or
        n-best paths). Each path's edit distance to ``supervision`` is
        weighted by a normaliser and summed into the loss. A per-frame,
        per-pdf gradient is accumulated and saved on ``ctx``.

        Args:
            ctx: Context object; receives the gradient through
                ``save_for_backward``.
            loglikes: Network output tensor; its first dimension is used as
                the path length and as the row index of the gradient.
            asr_decoder: Decoder whose ``decode`` result provides a
                ``"lattice"`` entry.
            trans_model: Transition model used for the phone conversion and
                for mapping transition ids to pdf ids.
            supervision: Reference label sequence. When
                ``config['phone_level']`` is set it is replaced by a phone
                sequence derived from it.
            config: Mapping read for the keys ``lm_weight``, ``am_weight``,
                ``phone_level``, ``rand_path``, ``num_paths`` and
                ``equal_weight``.

        Returns:
            The weighted sum of edit distances as a tensor.
        """
        host_loglikes = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
        lattice = asr_decoder.decode(host_loglikes)["lattice"]
        kaldi_lat.functions.top_sort_lattice_if_needed(lattice)

        lat_scale = kaldi_fst.utils.lattice_scale(config['lm_weight'],
                                                  config['am_weight'])
        kaldi_fst.utils.scale_lattice(lat_scale, lattice)
        # The three results are not used below; the call itself is retained.
        _lat_like, _post, _acoustic_like = (
            kaldi_lat.functions.lattice_forward_backward(lattice))

        if config['phone_level']:
            kaldi_lat.functions.convert_lattice_to_phones(trans_model, lattice)
            _, phone_clusters = kaldi_hmm.split_to_phones(
                trans_model, supervision)
            supervision = [
                trans_model.transition_id_to_phone(cluster[0])
                for cluster in phone_clusters
            ]

        ifst = kaldi_fst.utils.convert_lattice_to_std(lattice)
        num_frames = loglikes.size(0)

        ilabels, olabels, weights, edit_distance = [], [], [], []

        def _keep_if_new(path_fst):
            # Record a linear path unless its output labels were seen before.
            ilabel, olabel, weight = (
                kaldi_fst.utils.get_linear_symbol_sequence(path_fst))
            if olabel in olabels:
                return
            olabels.append(olabel)
            ilabels.append(ilabel)
            weights.append(weight.value)
            edit_distance.append(editdistance.eval(olabel, supervision))

        if config['rand_path']:
            # At most 500 sampling attempts, stopping early once enough
            # distinct paths have been collected.
            for _ in range(500):
                if len(olabels) >= config['num_paths']:
                    break
                sampled = kaldi_fst.StdVectorFst()
                rand_arg = np.random.randint(0, 10000)
                if kaldi_fst.utils.equal_align(ifst, num_frames, rand_arg,
                                               sampled):
                    _keep_if_new(sampled)
        else:
            for path in kaldi_fst.utils.nbest_as_fsts(ifst,
                                                      config['num_paths']):
                _keep_if_new(path)

        num_kept = len(weights)
        if config['equal_weight']:
            normalizer = th.ones(num_kept, dtype=th.float32) * 1.0 / num_kept
        else:
            # Softmax over the negated path weights.
            normalizer = F.softmax(-th.FloatTensor(weights), dim=0)

        errors = th.FloatTensor(edit_distance)
        loss = (errors * normalizer).sum()
        grad_value = (errors - loss) * normalizer

        # Scatter each path's gradient onto the (frame, pdf) cells it visits.
        grad_out = th.zeros(loglikes.size())
        for path_grad, ilabel in zip(grad_value, ilabels):
            for frame, trans_id in enumerate(ilabel):
                pdf_id = trans_model.transition_id_to_pdf(trans_id)
                grad_out[frame][pdf_id] += path_grad

        ctx.save_for_backward(grad_out.cuda())

        return loss
예제 #7
0
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, aligner, args):
    """Run one training epoch combining cross-entropy and a sequence loss.

    For each batch the summed cross-entropy loss is scaled by
    ``args.ce_ratio``. Then, per utterance, the prior-subtracted output is
    aligned against its word labels and the sequence criterion (MMI when
    ``args.criterion == "mmi"``, otherwise sMBR) is added. Utterances for
    which alignment or the criterion raises are skipped with a warning.

    Args:
        model: Network; called on the batch features.
        optimizer: Optimizer stepped once per batch.
        log_prior: Value subtracted from each utterance's output before
            alignment and the sequence criterion.
        dataloader: Iterable of batches with the keys ``x``, ``y``,
            ``num_frs``, ``utt_ids`` and ``aux``.
        epoch: Epoch number, used only in the progress prefix.
        asr_decoder: Decoder handed to the sequence criterion.
        trans_model: Transition model handed to the sequence criterion.
        silence_ids: Passed to the sMBR criterion only.
        aligner: Object whose ``align`` result provides an ``"alignment"``
            entry.
        args: Options; reads ``criterion``, ``ce_ratio``, ``max_grad_norm``,
            ``save_freq``, ``exp_dir`` and ``print_freq``.
    """
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
    if args.criterion == "mmi":
        criterion = ops.MMIFunction.apply
    else:
        criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  # word labels for se loss

        x = feat.to(th.float32)
        y = label.long()
        x = x.cuda()
        y = y.cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))
        loss = args.ce_ratio * ce_loss

        for j in range(len(num_frs)):
            # Keep the first num_frs[j] frames and subtract the log prior.
            loglike = prediction[j, :, :]
            loglike_j = loglike[:num_frs[j], :]
            loglike_j = loglike_j - log_prior

            text = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            try:
                align_in = kaldi_matrix.Matrix(
                    loglike_j.detach().cpu().numpy())
                align_out = aligner.align(align_in, text)
                trans_ids = align_out["alignment"]

                if args.criterion == "mmi":
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids)
                else:
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids, args.criterion, silence_ids)
                loss += se_loss.cuda()
            except Exception as err:
                # `except Exception` (not a bare `except`) lets
                # KeyboardInterrupt / SystemExit stop training; the cause is
                # reported because this block also covers the criterion call.
                print(
                    "Warning: failed to align utterance {}, skip the utterance for SE loss ({}: {})"
                    .format(utt_ids[j], type(err).__name__, err))

        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping to args.max_grad_norm.
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # Update the loss meter with the per-frame loss of this batch.
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # Measure elapsed time for this batch, then restart the timer so the
        # meter does not accumulate time since the start of the epoch.
        batch_time.update(time.time() - end)
        end = time.time()

        # Save a checkpoint (rank 0 only).
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
예제 #8
0
def main():
    """Decode an evaluation set with a trained model and write lattices.

    Parses the command line, loads the YAML config, builds the data loader
    and the LSTM acoustic model, restores the checkpoint, sets up the lattice
    decoder and writes one compact lattice per utterance to ``-out_file``.
    Exits with status 1 if a required input file is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default: 200)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    # A single evaluation source pointing at the given data path.
    config["source_paths"] = [{"type": "Eval", "wav": args.data_path}]

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # SECURITY NOTE: pickle.load executes arbitrary code from the file;
        # only point -transform at trusted files.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # Create the model.
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use the selected device everywhere so the CPU fallback actually works.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    # Explicit check instead of `assert`, which is stripped under -O.
    if not os.path.isfile(args.model_path):
        sys.stderr.write('ERROR: model file %s does not exist!\n' %
                         (args.model_path))
        sys.exit(1)

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']
    # Strip the 'module.' prefix of DataParallel per key, and only from keys
    # that carry it; an un-prefixed state dict is loaded untouched.
    prefix = "module."
    if any(k.startswith(prefix) for k in state_dict):
        from collections import OrderedDict
        state_dict = OrderedDict(
            (k[len(prefix):] if k.startswith(prefix) else k, v)
            for k, v in state_dict.items())
    model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    if not os.path.isfile(args.trans_model):
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    # The parsed model is not used further below (the decoder re-reads the
    # file by path); reading it here still verifies that the file parses.
    trans_model = kaldi_hmm.TransitionModel()
    with kaldi_util.io.xopen(args.trans_model) as ki:
        trans_model.read(ki.stream(), ki.binary)

    # Log of the first row of the prior matrix, normalised to sum to one.
    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # Now we can set up the decoder.
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # To produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad():
        with kaldi_util.table.CompactLatticeWriter("ark:" +
                                                   args.out_file) as lat_out:
            for data in test_dataloader:
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]

                x = feat.to(th.float32).to(device)

                prediction = model(x)

                for j in range(len(num_frs)):
                    loglikes = prediction[j, :, :].data.cpu()

                    # Keep the first num_frs[j] frames, subtract log prior.
                    loglikes_j = loglikes[:num_frs[j], :]
                    loglikes_j = loglikes_j - log_prior

                    decoder_out = asr_decoder.decode(
                        kaldi_matrix.Matrix(loglikes_j.numpy()))

                    key = utt_ids[j][0]
                    print(key, decoder_out["text"])

                    print("Log-like per-frame for utterance {} is {}".format(
                        key, decoder_out["likelihood"] / num_frs[j]))

                    # Save the lattice under the utterance key.
                    lat_out[key] = decoder_out["lattice"]