Example #1
    def utt_to_samples(self, args):
        '''Utt to samples of (feat_chunk, spk_id, sample_key).

        Will be run in a process pool so restrictions apply.
        '''
        result_queue, utt_info = args
        logging.debug(utt_info)

        # TODO: wrap into a function or something
        utt_key, utt_meta = utt_info

        # Load features and select voiced frames.
        feat_scp = utt_meta['feat']
        feat_mat = kaldiio.load_mat(feat_scp)
        num_frames_feat, feat_dim = feat_mat.shape
        vad_scp = utt_meta['vad']

        if vad_scp:
            vad_mat = kaldiio.load_mat(vad_scp)
            num_frames_vad = vad_mat.shape[0]
            logging.debug('feat_mat: %s, vad_mat: %s' %
                          (str(feat_mat.shape), str(vad_mat.shape)))
            if num_frames_feat != num_frames_vad:
                logging.debug('num_frames_feat != num_frames_vad: %d vs %d' %
                              (num_frames_feat, num_frames_vad))
                return None
            voiced_frames_index = np.where(vad_mat == 1)[0]
            logging.debug('voiced_frames_index: %s' %
                          (str(voiced_frames_index.shape)))
            feat_mat_voiced = feat_mat[voiced_frames_index, :]
        else:
            # If no VAD info was found, the entire utt will be used.
            feat_mat_voiced = feat_mat
        num_frames_voiced = feat_mat_voiced.shape[0]
        logging.debug('feat_mat_voiced: %s' % (str(feat_mat_voiced.shape)))

        spk_id = utt_meta['spkid']

        logging.debug('Chunk size: %d' % (self.chunk_size))

        results = []
        chunk_idx = 0
        if self.add_random_offset:
            random_offset = np.random.randint(0, self.chunk_size)
        else:
            random_offset = 0
        for offset in range(random_offset, num_frames_voiced, self.chunk_size):
            if self.single_chunk:
                available = num_frames_voiced - self.chunk_size
                if available < 0:
                    # Utterance shorter than chunk_size; skip it (no padding).
                    logging.warning('Single chunk mode: available < 0.')
                    break
                offset = random.randint(0, available)
            logging.debug('offset = %d' % (offset))
            feat_chunk = feat_mat_voiced[offset:offset + self.chunk_size, :]
            unpadded_frames = feat_chunk.shape[0]
            if self.pad_chunks and unpadded_frames < self.chunk_size:
                rel_chunk_len = float(unpadded_frames) / self.chunk_size
                if rel_chunk_len < self.drop_short_chunks:
                    continue
                logging.debug('Padding chunk of frames %d ...' %
                              (unpadded_frames))
                padded = np.zeros((self.chunk_size, feat_dim),
                                  dtype=feat_chunk.dtype)
                padded[:unpadded_frames, :] = feat_chunk
                feat_chunk = padded
            feat_chunk = np.expand_dims(feat_chunk, axis=2)  # TODO: not here
            sample_key = '%s_chunk%02d' % (utt_key, chunk_idx)
            sample = (feat_chunk, spk_id, sample_key)
            chunk_idx += 1
            results.append(sample)
            if self.single_chunk:
                break
        if result_queue:
            # queue mode
            result_queue.put(results)
            return None
        # imap mode
        return results
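The chunking loop above is the core of this sample generator. A minimal standalone sketch of the same fixed-size chunking with zero-padding, assuming a NumPy feature matrix (chunk_features is an illustrative name, not part of the original class):

import numpy as np

def chunk_features(feat_mat, chunk_size, pad=True):
    """Split a [T, D] feature matrix into [chunk_size, D] chunks, zero-padding the tail."""
    chunks = []
    for offset in range(0, feat_mat.shape[0], chunk_size):
        chunk = feat_mat[offset:offset + chunk_size]
        if pad and chunk.shape[0] < chunk_size:
            padded = np.zeros((chunk_size, feat_mat.shape[1]), dtype=chunk.dtype)
            padded[:chunk.shape[0]] = chunk
            chunk = padded
        chunks.append(chunk)
    return chunks

feats = np.random.randn(250, 40).astype(np.float32)
print([c.shape for c in chunk_features(feats, 100)])  # -> [(100, 40), (100, 40), (100, 40)]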
Example #2
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--verbose',
                        '-V',
                        default=0,
                        type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                        '"mat" is the matrix format in kaldi')
    parser.add_argument('--stats-filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the stats. '
                        '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for the wspecifier. '
                        '"mat" is the matrix format in kaldi')

    parser.add_argument('--norm-means',
                        type=strtobool,
                        default=True,
                        help='Do mean normalization or not.')
    parser.add_argument('--norm-vars',
                        type=strtobool,
                        default=False,
                        help='Do variance normalization or not.')
    parser.add_argument('--reverse',
                        type=strtobool,
                        default=False,
                        help='Do reverse mode or not')
    parser.add_argument('--spk2utt',
                        type=str,
                        help='A text file of speaker to utterance-list map. '
                        '(Don\'t give rspecifier format, such as '
                        '"ark:spk2utt")')
    parser.add_argument('--utt2spk',
                        type=str,
                        help='A text file of utterance to speaker map. '
                        '(Don\'t give rspecifier format, such as '
                        '"ark:utt2spk")')
    parser.add_argument('--write-num-frames',
                        type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--compress',
                        type=strtobool,
                        default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method (if mat) or gzip-level (if hdf5)')
    parser.add_argument('stats_rspecifier_or_rxfilename',
                        help='Input stats. e.g. ark:stats.ark or stats.mat')
    parser.add_argument('rspecifier',
                        type=str,
                        help='Read specifier id. e.g. ark:some.ark')
    parser.add_argument('wspecifier',
                        type=str,
                        help='Write specifier id. e.g. ark:some.ark')
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if ':' in args.stats_rspecifier_or_rxfilename:
        is_rspecifier = True
        if args.stats_filetype == 'npy':
            stats_filetype = 'hdf5'
        else:
            stats_filetype = args.stats_filetype

        stats_dict = dict(
            FileReaderWrapper(args.stats_rspecifier_or_rxfilename,
                              stats_filetype))
    else:
        is_rspecifier = False
        if args.stats_filetype == 'mat':
            stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        else:
            stats = numpy.load(args.stats_rspecifier_or_rxfilename)
        stats_dict = {None: stats}

    cmvn = CMVN(stats=stats_dict,
                norm_means=args.norm_means,
                norm_vars=args.norm_vars,
                utt2spk=args.utt2spk,
                spk2utt=args.spk2utt,
                reverse=args.reverse)

    with FileWriterWrapper(
            args.wspecifier,
            filetype=args.out_filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method) as writer:
        for utt, mat in FileReaderWrapper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # A sound file is loaded as Tuple[int, ndarray] (rate, data)
                rate, mat = mat
            mat = cmvn(mat, utt if is_rspecifier else None)
            writer[utt] = mat
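FileReaderWrapper and FileWriterWrapper are ESPnet helpers; for the plain Kaldi ark case they behave much like kaldiio's own ReadHelper and WriteHelper. A minimal sketch of the same read-transform-write loop using only kaldiio (the file names and the identity transform are placeholders):

from kaldiio import ReadHelper, WriteHelper

with ReadHelper('ark:feats.ark') as reader, \
        WriteHelper('ark,scp:out.ark,out.scp') as writer:
    for utt, mat in reader:
        # Apply a per-utterance transform here; identity as a placeholder.
        writer(utt, mat)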
Example #3
    def _get_from_loader(self, filepath, filetype):
        """Return ndarray

        To ensure file descriptors are opened only on first access,
        the loaders are cached in self._loaders.

        >>> ndarray = loader._get_from_loader(
        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')

        :param str filepath:
        :param str filetype:
        :return:
        :rtype: np.ndarray
        """
        if filetype == "hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # Open the file once on first access and cache the loader
                loader = h5py.File(filepath, "r")
                self._loaders[filepath] = loader
            return loader[key][()]
        elif filetype == "sound.hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "sound.hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # Open the file once on first access and cache the loader
                loader = SoundHDF5File(filepath, "r", dtype="int16")
                self._loaders[filepath] = loader
            array, rate = loader[key]
            return array
        elif filetype == "sound":
            # e.g.
            #    {"input": [{"feat": "some/path.wav",
            #                "filetype": "sound"},
            # Assume PCM16
            if not self.keep_all_data_on_mem:
                array, _ = soundfile.read(filepath, dtype="int16")
                return array
            if filepath not in self._loaders:
                array, _ = soundfile.read(filepath, dtype="int16")
                self._loaders[filepath] = array
            return self._loaders[filepath]
        elif filetype == "npz":
            # e.g.
            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
            #                "filetype": "npz",
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # Open the file once on first access and cache the loader
                loader = np.load(filepath)
                self._loaders[filepath] = loader
            return loader[key]
        elif filetype == "npy":
            # e.g.
            #    {"input": [{"feat": "some/path.npy",
            #                "filetype": "npy"},
            if not self.keep_all_data_on_mem:
                return np.load(filepath)
            if filepath not in self._loaders:
                self._loaders[filepath] = np.load(filepath)
            return self._loaders[filepath]
        elif filetype in ["mat", "vec"]:
            # e.g.
            #    {"input": [{"feat": "some/path.ark:123",
            #                "filetype": "mat"}]},
            # Here "123" is the byte offset of the matrix within the ark file.
            # load_mat can load both matrices and vectors.
            if not self.keep_all_data_on_mem:
                return kaldiio.load_mat(filepath)
            if filepath not in self._loaders:
                self._loaders[filepath] = kaldiio.load_mat(filepath)
            return self._loaders[filepath]
        elif filetype == "scp":
            # e.g.
            #    {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
            #                "filetype": "scp",
            filepath, key = filepath.split(":", 1)
            loader = self._loaders.get(filepath)
            if loader is None:
                # Open the file once on first access and cache the loader
                loader = kaldiio.load_scp(filepath)
                self._loaders[filepath] = loader
            return loader[key]
        else:
            raise NotImplementedError("Not supported: loader_type={}".format(filetype))
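The "scp" branch works because kaldiio.load_scp returns a lazy, dict-like reader, so caching one loader per scp file avoids re-parsing the scp while still reading matrices only on demand. A minimal usage sketch (the path and key are illustrative):

import kaldiio

loader = kaldiio.load_scp('feats.scp')  # parses the scp once; reads no matrix yet
mat = loader['F01_050C0101_PED_REAL']   # the actual ark read happens on this lookup
print(mat.shape)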
Example #4
    def __getitem__(self, indices):
        """Create mini-batch per step.

        Args:
            indices (np.ndarray): dataframe indices for the current mini-batch
        Returns:
            mini_batch_dict (dict):
                xs (list): input data of size `[T, input_dim]`
                xlens (list): lengths of xs
                ys (list): reference labels in the main task of size `[L]`
                ys_sub1 (list): reference labels in the 1st auxiliary task of size `[L_sub1]`
                ys_sub2 (list): reference labels in the 2nd auxiliary task of size `[L_sub2]`
                utt_ids (list): name of each utterance
                speakers (list): name of each speaker
                sessions (list): name of each session

        """
        # external alignment
        trigger_points = None
        if self.word_alignment_dir is not None:
            trigger_points = np.zeros(
                (len(indices), max([self.df['ylen'][i] for i in indices]) + 1),
                dtype=np.int32)
            for b, i in enumerate(indices):
                p = self.df['trigger_points'][i]
                trigger_points[b, :len(p)] = p - 1  # 0-indexed
                # NOTE: <eos> is not treated here
            trigger_points //= self.subsample_factor
        elif self.ctc_alignment_dir is not None:
            trigger_points = np.zeros(
                (len(indices), max([self.df['ylen'][i] for i in indices]) + 1),
                dtype=np.int32)
            for b, i in enumerate(indices):
                p = self.df['trigger_points'][i]  # including <eos>
                trigger_points[b, :len(p)] = p  # already 0-indexed

        # inputs
        xs = [kaldiio.load_mat(self.df['feat_path'][i]) for i in indices]
        xlens = [self.df['xlen'][i] for i in indices]
        utt_ids = [self.df['utt_id'][i] for i in indices]
        speakers = [self.df['speaker'][i] for i in indices]
        sessions = [self.df['session'][i] for i in indices]
        texts = [self.df['text'][i] for i in indices]
        feat_paths = [self.df['feat_path'][i] for i in indices]

        # main outputs
        if self.is_test:
            ys = [self._token2idx[0](self.df['text'][i]) for i in indices]
        else:
            ys = [
                list(map(int,
                         str(self.df['token_id'][i]).split())) for i in indices
            ]

        # sub1 outputs
        ys_sub1 = []
        if self.df_sub1 is not None:
            ys_sub1 = [
                list(map(int,
                         str(self.df_sub1['token_id'][i]).split()))
                for i in indices
            ]
        elif self._vocab_sub1 > 0 and not self.is_test:
            ys_sub1 = [self._token2idx[1](self.df['text'][i]) for i in indices]

        # sub2 outputs
        ys_sub2 = []
        if self.df_sub2 is not None:
            ys_sub2 = [
                list(map(int,
                         str(self.df_sub2['token_id'][i]).split()))
                for i in indices
            ]
        elif self._vocab_sub2 > 0 and not self.is_test:
            ys_sub2 = [self._token2idx[2](self.df['text'][i]) for i in indices]

        mini_batch_dict = {
            'xs': xs,
            'xlens': xlens,
            'ys': ys,
            'ys_sub1': ys_sub1,
            'ys_sub2': ys_sub2,
            'utt_ids': utt_ids,
            'speakers': speakers,
            'sessions': sessions,
            'text': texts,
            'feat_path': feat_paths,  # for plot
            'trigger_points': trigger_points,
        }
        return mini_batch_dict
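The xs entries are variable-length [T, input_dim] arrays, so a collate step downstream typically pads them into one dense tensor. A minimal zero-padding sketch, assuming NumPy (pad_batch is an illustrative helper, not part of this class):

import numpy as np

def pad_batch(xs):
    """Stack variable-length [T, D] arrays into a single [B, T_max, D] array."""
    t_max = max(x.shape[0] for x in xs)
    out = np.zeros((len(xs), t_max, xs[0].shape[1]), dtype=xs[0].dtype)
    for b, x in enumerate(xs):
        out[b, :x.shape[0]] = x
    return out

xs = [np.ones((t, 40), dtype=np.float32) for t in (120, 95, 130)]
print(pad_batch(xs).shape)  # -> (3, 130, 40)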
Example #5
import numpy as np

import kaldiio

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("lda_mat", help="original lda matrix (kaldi .mat)")
    parser.add_argument("out_lda_mat", help="new lda matrix to write on")
    args, leftovers = parser.parse_known_args()

    # Check that n is at least one dimension less than the original matrix.

    # Load the original LDA matrix.
    orig_mat = kaldiio.load_mat(args.lda_mat)

    # new_mat = scipy.linalg.orth(orig_mat)
    # new_mat, r = np.linalg.qr(np.transpose(orig_mat), mode="complete")
    # q,r=np.linalg.qr(np.transpose(orig_mat))
    # new_mat=q.T

    # B <- t(qr.Q(qr(A), complete=TRUE)[, 5:10])
    # is the same as
    # q, r = np.linalg.qr(a)
    # mat = q.T

    # MAYBE: transpose the other way round (before calculating?), or skip the second transpose?

    a = orig_mat.T
    q, r = np.linalg.qr(a, mode='complete')
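The script stops right after the QR factorization. Following the R recipe in the comments, the remaining step would take the trailing columns of the complete Q, which span the orthogonal complement of the row space of the LDA matrix, and write them out; a hedged sketch of that continuation (the column split and the save call are assumptions, not part of the original):

    # Assumption: orig_mat has full row rank k, so q[:, :k] spans its row space
    # and q[:, k:] spans the orthogonal complement.
    k = orig_mat.shape[0]
    new_mat = q[:, k:].T
    kaldiio.save_mat(args.out_lda_mat, new_mat)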
Example #6
    def __init__(self,
                 corpus,
                 tsv_path,
                 dict_path,
                 unit,
                 nlsyms,
                 wp_model,
                 is_test,
                 min_n_frames,
                 max_n_frames,
                 sort_by,
                 short2long,
                 tsv_path_sub1,
                 tsv_path_sub2,
                 ctc,
                 ctc_sub1,
                 ctc_sub2,
                 subsample_factor,
                 subsample_factor_sub1,
                 subsample_factor_sub2,
                 dict_path_sub1,
                 dict_path_sub2,
                 unit_sub1,
                 unit_sub2,
                 wp_model_sub1,
                 wp_model_sub2,
                 discourse_aware=False,
                 first_n_utterances=-1,
                 word_alignment_dir=None,
                 ctc_alignment_dir=None):
        """Custom Dataset class.

        Args:
            corpus (str): name of corpus
            tsv_path (str): path to the dataset tsv file
            dict_path (str): path to the dictionary
            unit (str): word/wp/char/phone/word_char
            nlsyms (str): path to the non-linguistic symbols file
            wp_model (str): path to the word-piece model for sentencepiece
            is_test (bool):
            min_n_frames (int): exclude utterances shorter than this value
            max_n_frames (int): exclude utterances longer than this value
            sort_by (str): how to sort all utterances
                input: sort by input length
                output: sort by output length
                shuffle: shuffle all utterances
                utt_id: sort by utterance id
            short2long (bool): if True, sort utterances in ascending order
                (short to long); otherwise descending
            ctc (bool):
            subsample_factor (int):
            discourse_aware (bool): sort in the discourse order
            first_n_utterances (int): evaluate the first N utterances
            word_alignment_dir (str): path to word alignment directory
            ctc_alignment_dir (str): path to CTC alignment directory

        """
        super(Dataset, self).__init__()

        self.epoch = 0

        # meta data accessed by the dataloader
        self._corpus = corpus
        self._set = os.path.basename(tsv_path).split('.')[0]
        self._vocab = count_vocab_size(dict_path)
        self._unit = unit
        self._unit_sub1 = unit_sub1
        self._unit_sub2 = unit_sub2

        self.is_test = is_test
        self.sort_by = sort_by
        assert sort_by in ['input', 'output', 'shuffle', 'utt_id']
        # if shuffle_bucket:
        #     assert sort_by in ['input', 'output']
        if discourse_aware:
            assert not is_test

        self.subsample_factor = subsample_factor
        self.word_alignment_dir = word_alignment_dir
        self.ctc_alignment_dir = ctc_alignment_dir

        self._idx2token = []
        self._token2idx = []

        # Set index converter
        if unit in ['word', 'word_char']:
            self._idx2token += [Idx2word(dict_path)]
            self._token2idx += [
                Word2idx(dict_path, word_char_mix=(unit == 'word_char'))
            ]
        elif unit == 'wp':
            self._idx2token += [Idx2wp(dict_path, wp_model)]
            self._token2idx += [Wp2idx(dict_path, wp_model)]
        elif unit in ['char']:
            self._idx2token += [Idx2char(dict_path)]
            self._token2idx += [Char2idx(dict_path, nlsyms=nlsyms)]
        elif 'phone' in unit:
            self._idx2token += [Idx2phone(dict_path)]
            self._token2idx += [Phone2idx(dict_path)]
        else:
            raise ValueError(unit)

        for i in range(1, 3):
            dict_path_sub = locals()['dict_path_sub' + str(i)]
            wp_model_sub = locals()['wp_model_sub' + str(i)]
            unit_sub = locals()['unit_sub' + str(i)]
            if dict_path_sub:
                setattr(self, '_vocab_sub' + str(i),
                        count_vocab_size(dict_path_sub))

                # Set index converter
                if unit_sub:
                    if unit_sub == 'wp':
                        self._idx2token += [
                            Idx2wp(dict_path_sub, wp_model_sub)
                        ]
                        self._token2idx += [
                            Wp2idx(dict_path_sub, wp_model_sub)
                        ]
                    elif unit_sub == 'char':
                        self._idx2token += [Idx2char(dict_path_sub)]
                        self._token2idx += [
                            Char2idx(dict_path_sub, nlsyms=nlsyms)
                        ]
                    elif 'phone' in unit_sub:
                        self._idx2token += [Idx2phone(dict_path_sub)]
                        self._token2idx += [Phone2idx(dict_path_sub)]
                    else:
                        raise ValueError(unit_sub)
            else:
                setattr(self, '_vocab_sub' + str(i), -1)

        # Load dataset tsv file
        df = pd.read_csv(tsv_path, encoding='utf-8', delimiter='\t')
        df = df.loc[:, [
            'utt_id', 'speaker', 'feat_path', 'xlen', 'xdim', 'text',
            'token_id', 'ylen', 'ydim'
        ]]
        for i in range(1, 3):
            if locals()['tsv_path_sub' + str(i)]:
                df_sub = pd.read_csv(locals()['tsv_path_sub' + str(i)],
                                     encoding='utf-8',
                                     delimiter='\t')
                df_sub = df_sub.loc[:, [
                    'utt_id', 'speaker', 'feat_path', 'xlen', 'xdim', 'text',
                    'token_id', 'ylen', 'ydim'
                ]]
                setattr(self, 'df_sub' + str(i), df_sub)
            else:
                setattr(self, 'df_sub' + str(i), None)
        self._input_dim = kaldiio.load_mat(df['feat_path'][0]).shape[-1]

        # Remove inappropriate utterances
        print('Original utterance num: %d' % len(df))
        n_utts = len(df)
        if is_test or discourse_aware:
            df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]
            print('Removed %d empty utterances' % (n_utts - len(df)))
            if first_n_utterances > 0:
                n_utts = len(df)
                df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]
                df = df.truncate(before=0, after=first_n_utterances - 1)
                print('Select first %d utterances' % len(df))
        else:
            df = df[df.apply(
                lambda x: min_n_frames <= x['xlen'] <= max_n_frames, axis=1)]
            df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]
            print('Removed %d utterances (threshold)' % (n_utts - len(df)))

            if ctc and subsample_factor > 1:
                n_utts = len(df)
                df = df[df.apply(lambda x: x['ylen'] <=
                                 (x['xlen'] // subsample_factor),
                                 axis=1)]
                print('Removed %d utterances (for CTC)' % (n_utts - len(df)))

            for i in range(1, 3):
                df_sub = getattr(self, 'df_sub' + str(i))
                ctc_sub = locals()['ctc_sub' + str(i)]
                subsample_factor_sub = locals()['subsample_factor_sub' +
                                                str(i)]
                if df_sub is not None:
                    if ctc_sub and subsample_factor_sub > 1:
                        df_sub = df_sub[df_sub.apply(
                            lambda x: x['ylen'] <=
                            (x['xlen'] // subsample_factor_sub),
                            axis=1)]

                    if len(df) != len(df_sub):
                        n_utts = len(df)
                        df = df.drop(df.index.difference(df_sub.index))
                        print('Removed %d utterances (for CTC, sub%d)' %
                              (n_utts - len(df), i))
                        for j in range(1, i + 1):
                            setattr(
                                self, 'df_sub' + str(j),
                                getattr(self, 'df_sub' + str(j)).drop(
                                    getattr(self, 'df_sub' +
                                            str(j)).index.difference(
                                                df.index)))

        if corpus == 'swbd':
            # 1. serialize
            # df['session'] = df['speaker'].apply(lambda x: str(x).split('-')[0])
            # 2. not serialize
            df['session'] = df['speaker'].apply(lambda x: str(x))
        else:
            df['session'] = df['speaker'].apply(lambda x: str(x))

        # Sort tsv records
        if discourse_aware:
            # Sort by onset (start time)
            df = df.assign(prev_utt='')
            df = df.assign(line_no=list(range(len(df))))
            if corpus == 'swbd':
                df['onset'] = df['utt_id'].apply(
                    lambda x: int(x.split('_')[-1].split('-')[0]))
            elif corpus == 'csj':
                df['onset'] = df['utt_id'].apply(
                    lambda x: int(x.split('_')[1]))
            elif corpus == 'tedlium2':
                df['onset'] = df['utt_id'].apply(
                    lambda x: int(x.split('-')[-2]))
            else:
                raise NotImplementedError(corpus)
            df = df.sort_values(by=['session', 'onset'], ascending=True)

            # Extract previous utterances
            groups = df.groupby('session').groups
            df['prev_utt'] = df.apply(lambda x: [
                df.loc[i, 'line_no'] for i in groups[x['session']]
                if df.loc[i, 'onset'] < x['onset']
            ],
                                      axis=1)
            df['n_prev_utt'] = df.apply(lambda x: len(x['prev_utt']), axis=1)
            df['n_utt_in_session'] = df.apply(
                lambda x: len([i for i in groups[x['session']]]), axis=1)
            df = df.sort_values(by=['n_utt_in_session'], ascending=short2long)

            # NOTE: this is used only when LM is trained with serialize: true
            # if is_test and corpus == 'swbd':
            #     # Sort by onset
            #     df['onset'] = df['utt_id'].apply(lambda x: int(x.split('_')[-1].split('-')[0]))
            #     df = df.sort_values(by=['session', 'onset'], ascending=True)

        elif not is_test:
            if sort_by == 'input':
                df = df.sort_values(by=['xlen'], ascending=short2long)
            elif sort_by == 'output':
                df = df.sort_values(by=['ylen'], ascending=short2long)
            elif sort_by == 'shuffle':
                df = df.reindex(np.random.permutation(df.index))

        # Fit word alignment to vocabulary
        if word_alignment_dir is not None:
            alignment2boundary = WordAlignmentConverter(dict_path, wp_model)
            n_utts = len(df)
            df['trigger_points'] = df.apply(lambda x: alignment2boundary(
                word_alignment_dir, x['speaker'], x['utt_id'], x['text']),
                                            axis=1)
            # remove utterances which do not have the alignment
            df = df[df.apply(lambda x: x['trigger_points'] is not None,
                             axis=1)]
            print('Removed %d utterances (for word alignment)' %
                  (n_utts - len(df)))
        elif ctc_alignment_dir is not None:
            n_utts = len(df)
            df['trigger_points'] = df.apply(lambda x: load_ctc_alignment(
                ctc_alignment_dir, x['speaker'], x['utt_id']),
                                            axis=1)
            # remove utterances which do not have the alignment
            df = df[df.apply(lambda x: x['trigger_points'] is not None,
                             axis=1)]
            print('Removed %d utterances (for CTC alignment)' %
                  (n_utts - len(df)))

        # Re-indexing
        if discourse_aware:
            self.df = df
            for i in range(1, 3):
                if getattr(self, 'df_sub' + str(i)) is not None:
                    setattr(self, 'df_sub' + str(i),
                            getattr(self, 'df_sub' + str(i)).reindex(df.index))
        else:
            self.df = df.reset_index()
            for i in range(1, 3):
                if getattr(self, 'df_sub' + str(i)) is not None:
                    setattr(
                        self, 'df_sub' + str(i),
                        getattr(self, 'df_sub' + str(i)).reindex(
                            df.index).reset_index())
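The CTC filter in the middle of this constructor encodes a real constraint: CTC requires the label sequence to be no longer than the number of (subsampled) input frames. A minimal standalone sketch of the same tsv filtering, assuming the column schema used above (the path, thresholds, and factor are illustrative):

import pandas as pd

subsample_factor = 4
df = pd.read_csv('train.tsv', encoding='utf-8', delimiter='\t')
df = df[df.apply(lambda x: 100 <= x['xlen'] <= 3000, axis=1)]  # frame-length thresholds
df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]             # drop empty references
# CTC constraint: the label length must fit into the subsampled input length
df = df[df.apply(lambda x: x['ylen'] <= x['xlen'] // subsample_factor, axis=1)]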
Example #7
# source vectorizer ...
target_speech_df = pd.read_csv(config_args['target_speech_metadata'],
                               encoding='utf-8')

target_label_set = config_args['target_language_set'].split()

# Make sure there are no utterances with 0 duration, e.g.:
# target_speech_df = target_speech_df[(target_speech_df.duration != 0)]

target_speech_df = target_speech_df[(
    target_speech_df['language'].isin(target_label_set))]

print(len(target_speech_df), target_label_set)

cmvn_stats = kaldiio.load_mat(config_args['source_cmvn'])
mean_stats = cmvn_stats[0, :-1]
count = cmvn_stats[0, -1]
offset = np.expand_dims(mean_stats, 0) / count

source_speech_vectorizer = LID_Vectorizer(
    data_dir=config_args['source_data_dir'],
    speech_df=source_speech_df,
    feature_type=config_args['input_signal_params']['feature_type'],
    label_set=config_args['source_language_set'].split(),
    max_num_frames=config_args['input_signal_params']['max_num_frames'],
    num_frames=config_args['input_signal_params']['num_frames'],
    feature_dim=config_args['model_arch']['feature_dim'],
    start_idx=config_args['input_signal_params']['start_index'],
    end_idx=config_args['input_signal_params']['end_index'],
    cmvn=offset)
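The slicing above follows Kaldi's CMVN stats layout: a 2 x (dim + 1) matrix whose first row holds per-dimension feature sums with the frame count in the last column, and whose second row holds sums of squares. A short sketch deriving both mean and variance from such stats (variance is unused above and shown only for completeness; the path is illustrative):

import numpy as np
import kaldiio

cmvn_stats = kaldiio.load_mat('cmvn.mat')     # shape: (2, dim + 1)
count = cmvn_stats[0, -1]                     # total number of frames
mean = cmvn_stats[0, :-1] / count             # E[x]
var = cmvn_stats[1, :-1] / count - mean ** 2  # E[x^2] - E[x]^2
std = np.sqrt(np.maximum(var, 1e-20))         # floor to avoid division by zero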
Example #8
def test_write_read_mat(tmpdir, endian, dtype):
    path = tmpdir.mkdir('test')
    valid = np.random.rand(1000, 120).astype(dtype)
    kaldiio.save_mat(path.join('a.mat').strpath, valid, endian=endian)
    test = kaldiio.load_mat(path.join('a.mat').strpath, endian=endian)
    np.testing.assert_array_equal(test, valid)
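As written, the test relies on endian and dtype fixtures defined elsewhere in the test suite. A hedged sketch of parametrization that would make it runnable under pytest on its own (the parameter values are assumptions):

import numpy as np
import pytest

import kaldiio

@pytest.mark.parametrize('endian', ['<', '>'])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_write_read_mat(tmpdir, endian, dtype):
    path = tmpdir.mkdir('test')
    valid = np.random.rand(1000, 120).astype(dtype)
    kaldiio.save_mat(path.join('a.mat').strpath, valid, endian=endian)
    test = kaldiio.load_mat(path.join('a.mat').strpath, endian=endian)
    np.testing.assert_array_equal(test, valid)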
Example #9
    path_of_scores = os.path.join(nnet_path, 'scores')
    createdir(path_of_scores, override=False, append=True)
    shutil.copyfile(__file__, os.path.join(nnet_path, 'train.py'))
    shutil.copyfile(args.config_path, os.path.join(nnet_path, 'config.json'))
    shutil.copytree(utils_path, os.path.join(exp_model_path, 'utils'))
    shutil.copytree(score_path, os.path.join(exp_model_path, 'score'))
    shutil.copytree(dataset_path, os.path.join(exp_model_path, 'dataset'))
    recorder = SummaryWriter(exp_model_path)
    log_path = os.path.join(exp_model_path, 'log')

    if params.net_type == 'basic':
        model = BasicNet(in_features=params.feature_dim, out_features=1, input_process=params.input_process)
    elif params.net_type == 'cnn_net':
        model = CnnNet(in_channels=2)
    elif params.net_type == 'kaldi_lda':
        model = LdaNet(load_mat(params.kaldi_transform_path), frozen=params.frozen, contains_bias=params.contains_bias, 
                       mid_feature=2048 if 'mid_feature' not in params.dict else params.mid_feature,
                       hidden_layers=1 if 'hidden_layers' not in params.dict else params.hidden_layers,
                       input_process='out_add_vec_cat' if 'input_process' not in params.dict else params.input_process)
    elif params.net_type == 'kaldi_cnn':
        model = KaldiReductionCNN(load_mat(params.kaldi_transform_path), frozen=params.frozen, contains_bias=params.contains_bias)
    else:
        raise ValueError('Unknown net_type: %s' % params.net_type)
    print2file(model)
    if params.use_gpu:
        model = model.cuda()
    batch_size = params.batch_size
    print2file('Dealing with data.')
    train_sl = SiameseSet(
        scp_path=params.train_path,
        utt_per_spk=None if 'utt_per_spk' not in params.dict else params.utt_per_spk,
        pre_load=False if 'pre_load' not in params.dict else params.pre_load,
        strategy=params.strategy,
Example #10
import json
import matplotlib.pyplot as plt
import kaldiio

root = "/home/nlp/ASR/espnet/egs/FSW"
with open(root + "/dump/test/deltafalse/data.json", "r") as f:
    test_json = json.load(f)["utts"]

key, info = list(test_json.items())[10]
fbank = kaldiio.load_mat(info["input"][0]["feat"])

# plot the speech feature
plt.matshow(fbank.T[::-1])
plt.title(key + ": " + info["output"][0]["text"])
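plt.matshow only draws onto the current figure; outside an interactive session nothing appears. A minimal closing step (the output filename is illustrative):

plt.show()
# or, in a headless environment:
# plt.savefig('fbank.png', bbox_inches='tight')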