Exemplo n.º 1
0
def gen_post(feat_list, model, skip_frames):
    """Run ``model`` on every listed feature file and save cepstral outputs.

    For each non-empty line of ``feat_list`` the HTK feature file is read,
    normalized with the global mean/variance loaded from ``stat_file``,
    arranged by ``org_data`` and passed through ``model``.  The model output
    is transformed with an orthonormal type-2 DCT (coefficients
    ``1..numcep`` are kept), extended with delta and delta-delta features and
    written to a file whose ``.fea`` extension is replaced by ``.mfc``.

    Args:
        feat_list: Path of a text file with one feature-file path per line.
        model: Network invoked as ``model(packed_input, lengths)``.
        skip_frames: Forwarded unchanged to ``org_data``.

    Raises:
        Exception: If ``read_mv`` returns ``None`` for the mean or variance.
    """
    # Put the model in test mode (the opposite of model.train(), essentially)
    model.eval()

    m, v = read_mv(stat_file)
    if m is None or v is None:
        raise Exception("mean or variance vector does not exist")
    std = np.sqrt(v) + eps  # loop-invariant; eps guards against zero variance

    with open(feat_list) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            print("generating features for file", line)

            utt_feat = htk_io.fopen(line).getall()
            utt_feat -= m  # normalize mean
            utt_feat /= std  # normalize var
            feat_numpy = org_data(utt_feat, skip_frames)
            x = torch.from_numpy(feat_numpy).type(gpu_dtype)
            input_size_list = [x.size(1)]  # number of time steps

            # ``volatile=True`` is ignored by current PyTorch, so gradient
            # tracking must be disabled explicitly for inference.
            with torch.no_grad():
                packed = nn.utils.rnn.pack_padded_sequence(x,
                                                           input_size_list,
                                                           batch_first=True)
                out_feat = model(packed, input_size_list)

            out_feat_numpy = out_feat.detach().cpu().numpy()
            out_feat_numpy = dct(out_feat_numpy, type=2, axis=1,
                                 norm='ortho')[:, 1:numcep + 1]
            out_feat_delta = delta(out_feat_numpy, 2)
            out_feat_ddelta = delta(out_feat_delta, 2)
            out_feat_numpy = np.concatenate(
                (out_feat_numpy, out_feat_delta, out_feat_ddelta), axis=1)

            # Swap only the trailing extension so a ".fea" substring inside a
            # directory name is left alone.
            if line.endswith(".fea"):
                out_file = line[:-len(".fea")] + ".mfc"
            else:
                # Legacy behavior for paths that do not end in ".fea".
                # NOTE(review): without any ".fea" in the path this equals the
                # input path, so the input file is overwritten -- confirm.
                out_file = line.replace(".fea", ".mfc")

            writer = htk_io.fopen(out_file,
                                  mode="wb",
                                  veclen=out_feat_numpy.shape[1])
            writer.writeall(out_feat_numpy)
            print("features saved in %s\n" % out_file)
Exemplo n.º 2
0
    def get_embed(self, wav_file, cfg_file):
        """Compute frame-level and utterance-level outputs for one wav file.

        ``self._HCopy(cfg_file, wav_file)`` is called first; the features are
        then read from the path obtained by replacing the last four characters
        of ``wav_file`` with ``.htk`` (presumably written by that call --
        TODO confirm).  The features are normalized with
        ``self.global_mean_emr`` / ``self.global_var_emr`` and passed through
        ``self.rnns``, ``self.batch_norm``, ``self.fc`` and ``self.cls``.

        Args:
            wav_file: Path of the input audio file.
            cfg_file: Configuration path handed to ``self._HCopy``.

        Returns:
            Tuple ``(x_prob, x_embed, x_prob_g, x_embed_g)`` of squeezed NumPy
            arrays: the softmax of ``self.cls`` and the ``self.fc`` output for
            every frame, followed by the same two quantities computed from the
            mean over frames of the ``self.batch_norm`` output.

        Raises:
            RuntimeError: If ``self._HCopy`` returns ``None``.  The original
                code reached the ``return`` with unbound locals in this case.
        """
        if self._HCopy(cfg_file, wav_file) is None:
            raise RuntimeError(
                "feature extraction failed for %s (config %s)" %
                (wav_file, cfg_file))

        htk_feat_file = wav_file[:-4] + '.htk'
        utt_feat = htk_io.fopen(htk_feat_file).getall()
        num_frms = utt_feat.shape[0]
        utt_feat -= self.global_mean_emr
        utt_feat /= (np.sqrt(self.global_var_emr) + 1e-8)
        # Add a leading axis of size one in front of (frames, dims).
        utt_feat = torch.FloatTensor(utt_feat[np.newaxis, :, :])

        # Keep the whole forward pass inside no_grad so that no autograd
        # graph is built during inference.
        with torch.no_grad():
            x = utt_feat.cuda()
            for rnn in self.rnns:
                x = rnn(x, [num_frms])
            x = self.batch_norm(x, [num_frms])

            # Frame-level outputs.
            x_embed = self.fc(x)
            x_prob = nn.Softmax(dim=2)(self.cls(x_embed))

            # Utterance-level outputs from the mean over all frames.
            x_ave = torch.mean(x[0, 0:num_frms, :], dim=0, keepdim=True)
            x_embed_g = self.fc(x_ave)
            x_prob_g = nn.Softmax(dim=1)(self.cls(x_embed_g))

        x_prob = np.squeeze(x_prob.cpu().numpy())
        x_embed = np.squeeze(x_embed.cpu().numpy())
        x_prob_g = np.squeeze(x_prob_g.cpu().numpy())
        x_embed_g = np.squeeze(x_embed_g.cpu().numpy())

        return x_prob, x_embed, x_prob_g, x_embed_g
Exemplo n.º 3
0
def proc_frame(feat_list, skip_frames=0):
    """Normalize paired source/target utterances and store them in HDF5 chunks.

    Every non-empty line of ``feat_list`` must hold two whitespace-separated
    paths: a source and a target HTK feature file.  Both are normalized with
    their global statistics (``stat_file_src`` / ``stat_file_tgt``) and
    buffered as ``(source, target)`` pairs.  Once the buffer holds
    ``buffer_seq`` utterances, ``make_chunk`` writes a numbered ``.h5`` file
    into ``out_folder_base`` and returns the updated buffer.  Leftover
    utterances are flushed at the end.

    Args:
        feat_list: Path of the text file listing the feature-file pairs.
        skip_frames: When positive, the source is padded at the end with
            ``skip_frames`` copies of its last frame and the first
            ``skip_frames`` frames are dropped.

    Raises:
        Exception: If either statistics file yields no mean/variance, or a
            line does not contain exactly two paths.
    """
    if not os.path.exists(out_folder_base):
        os.makedirs(out_folder_base)

    m_src, v_src = read_mv(stat_file_src)
    if m_src is None or v_src is None:
        raise Exception(
            "mean or variance vector for the source features does not exist")

    m_tgt, v_tgt = read_mv(stat_file_tgt)
    if m_tgt is None or v_tgt is None:
        raise Exception(
            "mean or variance vector for the target features does not exist")

    # Loop-invariant scaling vectors; eps guards against zero variance.
    std_src = np.sqrt(v_src) + eps
    std_tgt = np.sqrt(v_tgt) + eps

    utt_count = 0
    chunk_idx = -1
    data_cache = []
    buffer_len = 0

    # The context manager closes the list file even when an utterance fails
    # to load or a line is malformed.
    with open(feat_list, 'r') as f:
        while True:
            if buffer_len >= buffer_seq:  # buffer full: output to hard drive
                chunk_idx += 1
                print('Saving data chunk %d...' % chunk_idx)
                out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
                data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                    buffer_len)
                continue

            line = f.readline()
            if line == '':  # end of file
                print('All utterances processed')
                break

            line = line.strip()
            if not line:
                continue

            line_split = line.split()
            if len(line_split) != 2:
                raise Exception("target feat file missing")
            src_feat_file, tgt_feat_file = line_split

            utt_feat_tgt = htk_io.fopen(tgt_feat_file).getall()
            frm_num_tgt, _ = utt_feat_tgt.shape
            utt_feat_tgt -= m_tgt  # mean normalization
            utt_feat_tgt /= std_tgt  # var normalization

            utt_feat_src = htk_io.fopen(src_feat_file).getall()
            frm_num_src, _ = utt_feat_src.shape

            if frm_num_src > frm_num_tgt:
                # Trim the source so it is not longer than the target.
                # NOTE(review): a source shorter than the target is kept
                # as is -- confirm downstream code accepts that.
                utt_feat_src = utt_feat_src[:frm_num_tgt]
                print("%d source frames, match to %d target frames" %
                      (frm_num_src, frm_num_tgt))

            utt_feat_src -= m_src  # mean normalization
            utt_feat_src /= std_src  # var normalization

            if skip_frames > 0:
                # Pad the ending frames, then drop the same number of
                # leading frames so the length is unchanged.
                utt_feat_src = np.pad(utt_feat_src,
                                      ((0, skip_frames), (0, 0)),
                                      mode='edge')
                utt_feat_src = utt_feat_src[skip_frames:, :]

            data_cache.append((utt_feat_src, utt_feat_tgt))  # fill the buffer
            buffer_len += 1

            print("Processed %d of %d frames for file %s" %
                  (utt_feat_src.shape[0], frm_num_src, src_feat_file))
            utt_count += 1
            print(utt_count)

    # Flush whatever is still buffered after the list is exhausted.
    while buffer_len > 0:
        chunk_idx += 1
        print('Saving remaining data chunk %d...' % chunk_idx)
        out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
        if buffer_len > chunk_seq:
            data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                buffer_len)
        else:
            save_hd5(out_file, data_cache)
            buffer_len = 0
Exemplo n.º 4
0
def proc_frame(feat_list):
    """Build per-frame context windows with targets and store them in chunks.

    Every non-empty line of ``feat_list`` must hold two whitespace-separated
    paths: a source and a target HTK feature file.  Both are normalized with
    their global statistics.  The source is edge-padded by
    ``win_size_before`` / ``win_size_after`` frames, and for each target
    frame a window of ``win_size_before + 1 + win_size_after`` source frames
    is cut out, transposed, given a leading axis of size one and buffered
    together with the target frame.  Once the buffer holds
    ``buffer_num_frms`` entries, ``make_chunk`` writes a numbered ``.h5``
    file into ``out_folder_base`` and returns the updated buffer.  Leftover
    entries are flushed at the end.

    Args:
        feat_list: Path of the text file listing the feature-file pairs.

    Raises:
        Exception: If either statistics file yields no mean/variance, or a
            line does not contain exactly two paths.
    """
    if not os.path.exists(out_folder_base):
        os.makedirs(out_folder_base)

    m_src, v_src = read_mv(stat_file_src)
    if m_src is None or v_src is None:
        raise Exception(
            "mean or variance vector for the source features does not exist")

    m_tgt, v_tgt = read_mv(stat_file_tgt)
    if m_tgt is None or v_tgt is None:
        raise Exception(
            "mean or variance vector for the target features does not exist")

    # Loop-invariant scaling vectors; eps guards against zero variance.
    std_src = np.sqrt(v_src) + eps
    std_tgt = np.sqrt(v_tgt) + eps
    win_len = win_size_before + win_size_after + 1

    utt_count = 0
    chunk_idx = -1
    data_cache = []
    buffer_len = 0

    # The context manager closes the list file even when an utterance fails
    # to load or a line is malformed.
    with open(feat_list, 'r') as f:
        while True:
            if buffer_len >= buffer_num_frms:  # buffer full: write a chunk
                chunk_idx += 1
                print('Saving data chunk %d...' % chunk_idx)
                out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
                data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                    buffer_len)
                continue

            line = f.readline()
            if line == '':  # end of file
                print('All utterances processed')
                break

            line = line.strip()
            if not line:
                continue

            line_split = line.split()
            if len(line_split) != 2:
                raise Exception("target feat file missing")
            src_feat_file, tgt_feat_file = line_split

            utt_feat_tgt = htk_io.fopen(tgt_feat_file).getall()
            frm_num_tgt, _ = utt_feat_tgt.shape
            utt_feat_tgt -= m_tgt  # mean normalization
            utt_feat_tgt /= std_tgt  # var normalization

            utt_feat_src = htk_io.fopen(src_feat_file).getall()
            frm_num_src, _ = utt_feat_src.shape

            if frm_num_src > frm_num_tgt:
                # Extra source frames are simply never visited below.
                print("%d source frames, match to %d target frames" %
                      (frm_num_src, frm_num_tgt))
            # NOTE(review): if the source is shorter than the target, the
            # last windows are cut short by slicing -- confirm whether this
            # can occur and how downstream code should treat it.

            utt_feat_src -= m_src  # mean normalization
            utt_feat_src /= std_src  # var normalization
            # Pad the starting and ending frames so every target frame has a
            # full context window.
            utt_feat_src = np.pad(
                utt_feat_src, ((win_size_before, win_size_after), (0, 0)),
                mode='edge')

            # After padding, the window centered on target frame ``t`` starts
            # at row ``t`` of the padded source.
            for t in range(frm_num_tgt):  # process one utterance
                window = utt_feat_src[t:t + win_len, :].T
                block_data = window.reshape(1, window.shape[0],
                                            window.shape[1])
                data_cache.append((block_data, utt_feat_tgt[t]))
                buffer_len += 1

            # The loop always emits exactly ``frm_num_tgt`` entries, so the
            # former count-mismatch check could never fire and was removed.
            print("Processed %d of %d frames for file %s" %
                  (frm_num_tgt, frm_num_tgt, src_feat_file))
            utt_count += 1
            print(utt_count)

    # Flush whatever is still buffered after the list is exhausted.
    while buffer_len > 0:
        chunk_idx += 1
        print('Saving remaining data chunk %d...' % chunk_idx)
        out_file = out_folder_base + '/' + str(chunk_idx) + '.h5'
        if buffer_len > chunk_num_frms:
            data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                buffer_len)
        else:
            save_hd5(out_file, data_cache)
            buffer_len = 0
Exemplo n.º 5
0
def proc_seq(mlf_dict, feat_list, out_folder, skip_frames=5):
    """Pair normalized utterances with their labels and store them in chunks.

    Every non-empty line of ``feat_list`` is the path of an HTK feature file.
    Its label is looked up in ``mlf_dict`` under the last ``/``-separated
    path component.  The features are normalized with the statistics from
    ``stat_file`` and buffered as ``(features, label)``.  Once the buffer
    holds ``buffer_seq`` utterances, ``make_chunk`` writes a numbered ``.h5``
    file into ``out_folder`` and returns the updated buffer.  Leftover
    utterances are flushed at the end.

    Args:
        mlf_dict: Mapping from file name to label.  Entries are removed as
            they are consumed, so the caller's dict is modified.
        feat_list: Path of the text file listing the feature files.
        out_folder: Output directory; created if it does not exist.
        skip_frames: When positive, the features are padded at the end with
            ``skip_frames`` copies of the last frame and the first
            ``skip_frames`` frames are dropped.

    Raises:
        Exception: If ``read_mv`` returns ``None`` for the mean or variance.
        KeyError: If a listed file has no entry in ``mlf_dict``.
    """
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    m, v = read_mv(stat_file)
    if m is None or v is None:
        raise Exception("mean or variance vector does not exist")
    std = np.sqrt(v) + eps  # loop-invariant; eps guards against zero variance

    utt_count = 0
    chunk_idx = -1
    data_cache = []
    buffer_len = 0

    # The context manager closes the list file even when a label is missing
    # or a feature file cannot be read.
    with open(feat_list, 'r') as f:
        while True:
            if buffer_len >= buffer_seq:  # buffer full: output to hard drive
                chunk_idx += 1
                print('Saving data chunk %d...' % chunk_idx)
                out_file = out_folder + '/' + str(chunk_idx) + '.h5'
                data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                    buffer_len)
                continue

            line = f.readline()
            if line == '':  # end of file
                print('All utterances processed')
                break

            line = line.strip()
            if not line:
                continue

            filename_key = line.split('/')[-1]
            label = mlf_dict[filename_key]

            utt_feat_src = htk_io.fopen(line).getall()
            # Unpacking keeps the implicit requirement of a 2-D array.
            _, _ = utt_feat_src.shape

            utt_feat_src -= m  # mean normalization
            utt_feat_src /= std  # var normalization
            if skip_frames > 0:
                # Pad the ending frames, then drop the same number of
                # leading frames so the length is unchanged.
                utt_feat_src = np.pad(utt_feat_src,
                                      ((0, skip_frames), (0, 0)),
                                      mode='edge')
                utt_feat_src = utt_feat_src[skip_frames:, :]

            data_cache.append((utt_feat_src, label))  # fill the buffer
            buffer_len += 1

            print("Processed %d frames for file %s" %
                  (utt_feat_src.shape[0], filename_key))
            # Drop the consumed label from the caller's dict.
            mlf_dict.pop(filename_key)
            utt_count += 1
            print(utt_count)

    # Flush whatever is still buffered after the list is exhausted.
    while buffer_len > 0:
        chunk_idx += 1
        print('Saving remaining data chunk %d...' % chunk_idx)
        out_file = out_folder + '/' + str(chunk_idx) + '.h5'
        if buffer_len > chunk_seq:
            data_cache, buffer_len = make_chunk(out_file, data_cache,
                                                buffer_len)
        else:
            save_hd5(out_file, data_cache)
            buffer_len = 0
Exemplo n.º 6
0
def gen_decoded(feat_list, model_path):
    """Decode every listed feature file with a CTC model and write an MLF.

    A ``Layered_RNN`` is built, its parameters are loaded from
    ``model_path`` and a decoder is selected by the module-level
    ``decoder_type`` (``'Greedy'``, ``'Beam'`` or ``'Beam_LM'``).  Each
    non-empty line of ``feat_list`` names an HTK feature file that is
    normalized with the statistics from ``stat_file``, run through the model
    and decoded.  The results are written to ``out_mlf``: a ``#!MLF!#``
    header, then per file the quoted ``.rec`` name, one decoded symbol per
    line and a terminating ``.`` line.

    Args:
        feat_list: Path of the text file listing the feature files.
        model_path: Path of the saved model state dict.

    Raises:
        ValueError: If ``decoder_type`` is not a supported value.  The
            original code only failed later, with ``decoder`` unbound, after
            part of the output file had been written.
        Exception: If ``read_mv`` returns ``None`` for the mean or variance.
    """
    model = set_model_ctc.Layered_RNN(
        rnn_input_size=40,
        nb_layers=layers,
        rnn_hidden_size=hidden_size,
        bidirectional=(num_dirs == 2),
        batch_norm=True,
        num_classes=61)
    model = model.type(gpu_dtype)
    model.load_state_dict(torch.load(model_path))  # load model params
    # Put the model in test mode (the opposite of model.train(), essentially)
    model.eval()

    if decoder_type == 'Greedy':
        labels = create_mapping(mapping_file)
        decoder = ctc_decode.GreedyDecoder_test(
            labels, output='char', space_idx=-1)  # setup greedy decoder
    elif decoder_type == 'Beam':
        labels = create_mapping(mapping_file)
        scorer = Scorer()
        decoder = ctc_decode.BeamDecoder_test(
            labels,
            scorer,
            top_paths=1,
            beam_width=200,
            output='char',
            space_idx=-1)  # setup beam decoder without lm
    elif decoder_type == 'Beam_LM':
        labels_symbol = '_123456789abcde~-hij,.|{ofg?!+u}[x]@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        labels_true = create_mapping(mapping_file)
        # need to use the fake symbols here for consistency with the trie
        scorer = KenLMScorer(labels_symbol,
                             kenlm_path,
                             trie_path,
                             blank_index=0,
                             space_index=-1)
        scorer.set_lm_weight(lm_weight)
        scorer.set_word_weight(lm_beta1)
        scorer.set_valid_word_weight(lm_beta2)
        # need to use the true timit label to convert the decoded position
        # indexes back to phone labels
        decoder = ctc_decode.BeamDecoder_test(
            labels_true,
            scorer,
            top_paths=1,
            beam_width=200,
            output='char',
            space_idx=-1)  # setup beam decoder with lm
    else:
        raise ValueError("unsupported decoder_type: %r" % (decoder_type,))

    m, v = read_mv(stat_file)
    if m is None or v is None:
        raise Exception("mean or variance vector does not exist")
    std = np.sqrt(v) + eps  # loop-invariant; eps guards against zero variance

    with open(feat_list) as f, open(out_mlf, 'w') as fw:
        fw.write('#!MLF!#\n')
        for line in f:
            line = line.strip()
            if not line:
                continue
            print("recognizing file %s" % line)

            # Strip the extension; a path without any dot is used whole
            # (rfind() == -1 would otherwise chop its last character).
            dot = line.rfind('.')
            stem = line[:dot] if dot != -1 else line
            fw.write('"' + stem + '.rec' + '"' + '\n')

            utt_feat = htk_io.fopen(line).getall()
            utt_feat -= m  # normalize mean
            utt_feat /= std  # normalize var
            feat_numpy = org_data(utt_feat, skip_frames=5)
            x = torch.from_numpy(feat_numpy).type(gpu_dtype)
            input_sizes_list = [x.size(1)]

            # ``volatile=True`` is ignored by current PyTorch, so gradient
            # tracking must be disabled explicitly for inference.
            with torch.no_grad():
                packed = nn.utils.rnn.pack_padded_sequence(
                    x, input_sizes_list, batch_first=True)
                probs = model(packed, input_sizes_list)

            probs = probs.detach().cpu()
            decoded = decoder.decode(probs, input_sizes_list)[0]
            for word in decoded:
                fw.write(word + '\n')
            fw.write('.\n')
            print(' '.join(decoded))