Example #1
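    # Yields (features, gender) pairs read from Kaldi archives for one epoch,
    # then advances the epoch counter and reshuffles once start_shuffle_epoch is reached.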
    def gen_train(self):
        for ele in self.train_list:
            feats = kaldi_io.read_mat(ele[0])
            gender = kaldi_io.read_vec_int(ele[1])
            yield feats, gender
        self.epoch += 1
        if self.epoch >= self.start_shuffle_epoch:
            self.shuffle()
Example #2
import random
import kaldi_io

def gen():
    # `train_set`, `sort_set` and `batch` are module-level globals defined
    # elsewhere in the original script.
    global train_set
    for ele in train_set:
        feats = kaldi_io.read_mat(ele[0])
        gender = kaldi_io.read_vec_int(ele[1])
        yield feats, gender
    # After one pass, rebuild train_set: split the pre-sorted sort_set into
    # batch-sized chunks and shuffle the chunk order.
    chunks = [sort_set[i:i + batch] for i in range(0, len(sort_set), batch)]
    random.shuffle(chunks)
    train_set = []
    for chunk in chunks:
        train_set.extend(chunk)
Example #3
import argparse
import numpy as np
import kaldi_io

# Floor feature values wherever the corresponding uv value is below the threshold.
def main_():
    ap = argparse.ArgumentParser(usage="")

    # optional args
    ap.add_argument("--floor_value", default=0, type=float)

    # positional args
    ap.add_argument("uv_rspecifier", help="input uv")
    ap.add_argument("feature_rspecifier", help="input feat")
    ap.add_argument("feature_wspecifier", help="output feat")

    args = ap.parse_args()
    thresh = 0.5
    floor = args.floor_value
    with kaldi_io.open_or_fd(args.uv_rspecifier, "rb") as uv_reader:
        with kaldi_io.open_or_fd(args.feature_wspecifier, "wb") as feature_writer:
            for key, feats in kaldi_io.read_mat_ark(args.feature_rspecifier):
                assert key == kaldi_io.read_key(uv_reader)
                uv = kaldi_io.read_mat(uv_reader)

                # Keep feature values where uv >= thresh; use the floor elsewhere.
                new_feats = np.where(uv >= thresh, feats, floor)
                kaldi_io.write_mat(feature_writer, new_feats, key=key)
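
# Hypothetical invocation (the script name and archive paths are placeholders):
#   python floor_uv.py --floor_value 0.0 ark:uv.ark ark:feats.ark ark:feats_floored.ark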
Example #4
import argparse
import numpy as np
import kaldi_io

# Drop frames whose uv value is below the threshold instead of flooring them.
def main_():
    ap = argparse.ArgumentParser(usage="")

    # positional args
    ap.add_argument("uv_rspecifier", help="input uv")
    ap.add_argument("feature_rspecifier", help="input feat")
    ap.add_argument("feature_wspecifier", help="output feat")

    args = ap.parse_args()
    thresh = 0.5
    with kaldi_io.open_or_fd(args.uv_rspecifier, "rb") as uv_reader:
        with kaldi_io.open_or_fd(args.feature_wspecifier,
                                 "wb") as feature_writer:
            for key, feats in kaldi_io.read_mat_ark(args.feature_rspecifier):
                assert key == kaldi_io.read_key(uv_reader)
                uv = kaldi_io.read_mat(uv_reader)
                # Indices of frames whose uv value is below the threshold;
                # delete those rows from the feature matrix.
                idx = np.where(uv < thresh)
                new_feats = np.delete(feats, idx[0], axis=0)
                if new_feats.shape[0] < 1:
                    continue
                kaldi_io.write_mat(feature_writer, new_feats, key=key)
                del uv, idx, new_feats
Example #5
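    # Yields (features, gender) pairs for the validation list.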
    def gen_valid(self):
        for ele in self.valid_list:
            feats = kaldi_io.read_mat(ele[0])
            gender = kaldi_io.read_vec_int(ele[1])
            yield feats, gender
Example #6
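    # Yields (features, frame-count) pairs for evaluation.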
    def gen_eval(self):
        for ele in self.data_list:
            feats = kaldi_io.read_mat(ele[0])
            frames = ele[1]
            yield feats, frames
Example #7
import os
import numpy as np
from utils.kaldi_io import read_mat

decay = 0.9999
data_type = ['tr', 'cv', 'tt']
feats_dim = 129
dir_pattern = "/mnt/hd8t/fsl/SpeechSeparation/mix/data/2speakers/wav8k/min/{}/record"

# Exponentially decayed running estimates of the per-dimension mean and variance.
def moving_average(mean_var, variance_var, value, decay):
    variance_var = decay * (variance_var + (1 - decay) * (value - mean_var) ** 2)
    mean_var = decay * mean_var + (1 - decay) * value
    return mean_var, variance_var

for i_type in data_type:
    dir_path = dir_pattern.format(i_type)
    cmvn_path = os.path.join(dir_path, 'global.cmvn.log')
    feats_path = os.path.join(dir_path, 'feats.scp')
    mean_var = np.zeros(feats_dim)
    variance_var = np.ones(feats_dim)
    with open(feats_path, 'r') as f:
        for line in f.readlines():
            line = line.strip().split()
            feats = read_mat(line[1])
            frames = int(len(feats) / 3)
            # use log(1+x)
            log_feats_input = np.log(feats[0:frames, 0:feats_dim] + 1)
            for item in log_feats_input:
                mean_var, variance_var = moving_average(mean_var, variance_var,
                                                        item, decay)
    cmvn = np.stack([mean_var, variance_var], axis=1)
    np.savetxt(cmvn_path, cmvn)
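
A minimal sketch, not part of the original script, of how a saved global.cmvn.log could be applied to new features, assuming the same log(1 + x) compression; apply_global_cmvn and its defaults are hypothetical:

def apply_global_cmvn(feats, cmvn_path, feats_dim=129, eps=1e-8):
    # cmvn has shape (feats_dim, 2): column 0 is the mean, column 1 the variance.
    cmvn = np.loadtxt(cmvn_path)
    mean, var = cmvn[:, 0], cmvn[:, 1]
    log_feats = np.log(feats[:, :feats_dim] + 1)
    return (log_feats - mean) / np.sqrt(var + eps)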
Example #8
    def __init__(self,
                 tsv_path,
                 dict_path,
                 unit,
                 batch_size,
                 nlsyms=False,
                 n_epochs=None,
                 is_test=False,
                 min_n_frames=40,
                 max_n_frames=2000,
                 shuffle=False,
                 sort_by_input_length=False,
                 short2long=False,
                 sort_stop_epoch=None,
                 n_ques=None,
                 dynamic_batching=False,
                 ctc=False,
                 subsample_factor=1,
                 wp_model=False,
                 corpus='',
                 concat_prev_n_utterances=0,
                 n_caches=0,
                 tsv_path_sub1=False,
                 dict_path_sub1=False,
                 unit_sub1=False,
                 wp_model_sub1=False,
                 ctc_sub1=False,
                 subsample_factor_sub1=1,
                 wp_model_sub2=False,
                 tsv_path_sub2=False,
                 dict_path_sub2=False,
                 unit_sub2=False,
                 ctc_sub2=False,
                 subsample_factor_sub2=1,
                 wp_model_sub3=False,
                 tsv_path_sub3=False,
                 dict_path_sub3=False,
                 unit_sub3=False,
                 ctc_sub3=False,
                 subsample_factor_sub3=1):
        """A class for loading dataset.

        Args:
            tsv_path (str): path to the dataset tsv file
            dict_path (str): path to the dictionary
            unit (str): word or wp or char or phone or word_char
            batch_size (int): size of mini-batch
            nlsyms (str): path to the non-linguistic symbols file
            n_epochs (int): max epoch. None means infinite loop.
            is_test (bool): evaluation mode (skip length filtering and sorting)
            min_n_frames (int): exclude utterances shorter than this value
            max_n_frames (int): exclude utterances longer than this value
            shuffle (bool): shuffle utterances.
                This is disabled when sort_by_input_length is True.
            sort_by_input_length (bool): sort all utterances by input length
                (direction controlled by short2long)
            short2long (bool): if True, sort utterances from short to long
                (ascending); otherwise from long to short
            sort_stop_epoch (int): after sort_stop_epoch, training reverts to
                a random order
            n_ques (int): number of elements to enqueue
            dynamic_batching (bool): change batch size dynamically in training
            ctc (bool): remove utterances whose label length exceeds the
                subsampled input length (CTC length constraint)
            subsample_factor (int): frame subsampling factor
            wp_model (str): path to the word-piece model for sentencepiece
            corpus (str): name of corpus
            concat_prev_n_utterances (int): number of utterances to concatenate
            n_caches (int): number of previous tokens for cache (for training)

        """
        super(Dataset, self).__init__()

        self.set = os.path.basename(tsv_path).split('.')[0]
        self.is_test = is_test
        self.unit = unit
        self.unit_sub1 = unit_sub1
        self.batch_size = batch_size
        self.max_epoch = n_epochs
        self.shuffle = shuffle
        self.sort_stop_epoch = sort_stop_epoch
        self.sort_by_input_length = sort_by_input_length
        self.n_ques = n_ques
        self.dynamic_batching = dynamic_batching
        self.corpus = corpus
        self.concat_prev_n_utterances = concat_prev_n_utterances
        self.n_caches = n_caches
        self.vocab = self.count_vocab_size(dict_path)

        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        self.idx2token = []
        self.token2idx = []

        # Set index converter
        if unit in ['word', 'word_char']:
            self.idx2token += [Idx2word(dict_path)]
            self.token2idx += [
                Word2idx(dict_path, word_char_mix=(unit == 'word_char'))
            ]
        elif unit == 'wp':
            self.idx2token += [Idx2wp(dict_path, wp_model)]
            self.token2idx += [Wp2idx(dict_path, wp_model)]
        elif unit == 'char':
            self.idx2token += [Idx2char(dict_path)]
            self.token2idx += [Char2idx(dict_path, nlsyms=nlsyms)]
        elif 'phone' in unit:
            self.idx2token += [Idx2phone(dict_path)]
            self.token2idx += [Phone2idx(dict_path)]
        else:
            raise ValueError(unit)

        for i in range(1, 4):
            dict_path_sub = locals()['dict_path_sub' + str(i)]
            wp_model_sub = locals()['wp_model_sub' + str(i)]
            unit_sub = locals()['unit_sub' + str(i)]
            if dict_path_sub:
                setattr(self, 'vocab_sub' + str(i),
                        self.count_vocab_size(dict_path_sub))

                # Set index converter
                if unit_sub:
                    if unit_sub == 'wp':
                        self.idx2token += [Idx2wp(dict_path_sub, wp_model_sub)]
                        self.token2idx += [Wp2idx(dict_path_sub, wp_model_sub)]
                    elif unit_sub == 'char':
                        self.idx2token += [Idx2char(dict_path_sub)]
                        self.token2idx += [
                            Char2idx(dict_path_sub, nlsyms=nlsyms)
                        ]
                    elif 'phone' in unit_sub:
                        self.idx2token += [Idx2phone(dict_path_sub)]
                        self.token2idx += [Phone2idx(dict_path_sub)]
                    else:
                        raise ValueError(unit_sub)
            else:
                setattr(self, 'vocab_sub' + str(i), -1)

        # Load dataset tsv file
        self.df = pd.read_csv(tsv_path, encoding='utf-8', delimiter='\t')
        self.df = self.df.loc[:, [
            'utt_id', 'speaker', 'feat_path', 'xlen', 'xdim', 'text',
            'token_id', 'ylen', 'ydim'
        ]]
        for i in range(1, 4):
            if locals()['tsv_path_sub' + str(i)]:
                df_sub = pd.read_csv(locals()['tsv_path_sub' + str(i)],
                                     encoding='utf-8',
                                     delimiter='\t')
                df_sub = df_sub.loc[:, [
                    'utt_id', 'speaker', 'feat_path', 'xlen', 'xdim', 'text',
                    'token_id', 'ylen', 'ydim'
                ]]
                setattr(self, 'df_sub' + str(i), df_sub)
            else:
                setattr(self, 'df_sub' + str(i), None)

        if corpus == 'swbd':
            self.df['session'] = self.df['speaker'].apply(
                lambda x: str(x).split('-')[0])
        else:
            self.df['session'] = self.df['speaker'].apply(lambda x: str(x))

        if concat_prev_n_utterances > 0 or n_caches > 0:
            max_n_frames = 10000
            min_n_frames = 1

            # Sort by onset
            self.df = self.df.assign(prev_utt='')
            if corpus == 'swbd':
                self.df['onset'] = self.df['utt_id'].apply(
                    lambda x: int(x.split('_')[-1].split('-')[0]))
            elif corpus == 'csj':
                self.df['onset'] = self.df['utt_id'].apply(
                    lambda x: int(x.split('_')[1]))
            else:
                raise NotImplementedError
            self.df = self.df.sort_values(by=['session', 'onset'],
                                          ascending=True)

            # Extract previous utterances
            if not (is_test and n_caches > 0):
                self.df = self.df.assign(line_no=list(range(len(self.df))))
                groups = self.df.groupby('session').groups  # dict
                self.df['prev_utt'] = self.df.apply(lambda x: [
                    self.df.loc[i, 'line_no'] for i in groups[x['session']]
                    if self.df.loc[i, 'onset'] < x['onset']
                ],
                                                    axis=1)
        elif is_test and corpus == 'swbd':
            # Sort by onset
            self.df['onset'] = self.df['utt_id'].apply(
                lambda x: int(x.split('_')[-1].split('-')[0]))
            self.df = self.df.sort_values(by=['session', 'onset'],
                                          ascending=True)

        if concat_prev_n_utterances > 0:
            assert n_caches == 0
            self.pad_xlen = 20

            # Truncate history
            self.df['prev_utt'] = self.df['prev_utt'].apply(
                lambda x: x[-concat_prev_n_utterances:])

            # Update xlen to include the concatenated history (with padding)
            self.df['xlen'] = self.df.apply(lambda x: sum([
                self.df.loc[i, 'xlen'] + self.pad_xlen for i in x['prev_utt']
            ] + [x['xlen']]) if len(x['prev_utt']) > 0 else x['xlen'],
                                            axis=1)
            # self.df['text'] = self.df.apply(
            #     lambda x: ' '.join([self.df.loc[i, 'text']
            #                         for i in x['prev_utt']] + [x['text']]) if len(x['prev_utt']) > 0 else x['text'], axis=1)

        if n_caches > 0:
            assert concat_prev_n_utterances == 0

        # Remove inappropriate utterances
        if is_test:
            print('Original utterance num: %d' % len(self.df))
            n_utts = len(self.df)
            self.df = self.df[self.df.apply(lambda x: x['ylen'] > 0, axis=1)]
            print('Removed %d empty utterances' % (n_utts - len(self.df)))
        else:
            print('Original utterance num: %d' % len(self.df))
            n_utts = len(self.df)
            self.df = self.df[self.df.apply(
                lambda x: min_n_frames <= x['xlen'] <= max_n_frames, axis=1)]
            self.df = self.df[self.df.apply(lambda x: x['ylen'] > 0, axis=1)]
            print('Removed %d utterances (threshold)' %
                  (n_utts - len(self.df)))

            if ctc and subsample_factor > 1:
                n_utts = len(self.df)
                self.df = self.df[self.df.apply(
                    lambda x: x['ylen'] <= (x['xlen'] // subsample_factor),
                    axis=1)]
                print('Removed %d utterances (for CTC)' %
                      (n_utts - len(self.df)))

            for i in range(1, 4):
                df_sub = getattr(self, 'df_sub' + str(i))
                ctc_sub = locals()['ctc_sub' + str(i)]
                subsample_factor_sub = locals()['subsample_factor_sub' +
                                                str(i)]
                if df_sub is not None:
                    if ctc_sub and subsample_factor_sub > 1:
                        df_sub = df_sub[df_sub.apply(
                            lambda x: x['ylen'] <=
                            (x['xlen'] // subsample_factor_sub),
                            axis=1)]

                    if len(self.df) != len(df_sub):
                        n_utts = len(self.df)
                        self.df = self.df.drop(
                            self.df.index.difference(df_sub.index))
                        print('Removed %d utterances (for CTC, sub%d)' %
                              (n_utts - len(self.df), i))
                        for j in range(1, i + 1):
                            setattr(
                                self, 'df_sub' + str(j),
                                getattr(self, 'df_sub' + str(j)).drop(
                                    getattr(self, 'df_sub' +
                                            str(j)).index.difference(
                                                self.df.index)))

        # Sort tsv records
        if not is_test:
            if sort_by_input_length:
                self.df = self.df.sort_values(by='xlen', ascending=short2long)
            elif shuffle:
                self.df = self.df.reindex(np.random.permutation(self.df.index))

        self.rest = set(list(self.df.index))
        self.input_dim = kaldi_io.read_mat(self.df['feat_path'][0]).shape[-1]
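
A hypothetical instantiation of the Dataset class above; the paths, unit, and sizes are placeholders, not taken from the original code:

train_set = Dataset(tsv_path='data/dataset/train.tsv',
                    dict_path='data/dict/train_wp.txt',
                    unit='wp',
                    wp_model='data/dict/train_wp.model',
                    batch_size=32,
                    n_epochs=25,
                    shuffle=True,
                    corpus='csj')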
Example #9
    def make_batch(self, df_indices):
        """Create mini-batch per step.

        Args:
            df_indices (np.ndarray): dataframe indices of the utterances to batch
        Returns:
            batch (dict):
                xs (list): input data of size `[T, input_dim]`
                xlens (list): lengths of each element in xs
                ys (list): reference labels in the main task of size `[L]`
                ys_sub1 (list): reference labels in the 1st auxiliary task of size `[L_sub1]`
                ys_sub2 (list): reference labels in the 2nd auxiliary task of size `[L_sub2]`
                ys_sub3 (list): reference labels in the 3rd auxiliary task of size `[L_sub3]`
                utt_ids (list): name of each utterance
                speakers (list): name of each speaker
                sessions (list): name of each session

        """
        # inputs
        xs = [kaldi_io.read_mat(self.df['feat_path'][i]) for i in df_indices]
        if self.concat_prev_n_utterances > 0:
            for j, i in enumerate(df_indices):
                for idx in self.df['prev_utt'][i][::-1]:
                    x_prev = kaldi_io.read_mat(self.df['feat_path'][idx])
                    xs[j] = np.concatenate([
                        x_prev,
                        np.zeros((self.pad_xlen, self.input_dim),
                                 dtype=np.float32), xs[j]
                    ],
                                           axis=0)

        # outputs
        ys = [
            list(map(int,
                     str(self.df['token_id'][i]).split())) for i in df_indices
        ]
        if self.concat_prev_n_utterances > 0:
            for j, i in enumerate(df_indices):
                for idx in self.df['prev_utt'][i][::-1]:
                    y_prev = list(
                        map(int,
                            str(self.df['token_id'][idx]).split()))
                    ys[j] = y_prev + [self.eos] + ys[j][:]

        ys_cache = []
        if self.n_caches > 0:
            ys_cache = [[] for _ in range(len(df_indices))]
            for j, i in enumerate(df_indices):
                for idx in self.df['prev_utt'][i]:
                    y_prev = list(
                        map(int,
                            str(self.df['token_id'][idx]).split()))
                    ys_cache[j] += [self.eos] + y_prev

            # Truncate the cache to the last n_caches tokens
            ys_cache = [y[-self.n_caches:] for y in ys_cache]

        ys_sub1 = []
        if self.df_sub1 is not None:
            ys_sub1 = [
                list(map(int, self.df_sub1['token_id'][i].split()))
                for i in df_indices
            ]
            if self.concat_prev_n_utterances > 0:
                for j, i in enumerate(df_indices):
                    for idx in self.df['prev_utt'][i][::-1]:
                        y_prev = list(
                            map(int,
                                str(self.df_sub1['token_id'][idx]).split()))
                        ys_sub1[j] = y_prev + [self.eos] + ys_sub1[j][:]
        elif self.vocab_sub1 > 0:
            ys_sub1 = [
                self.token2idx[1](self.df['text'][i]) for i in df_indices
            ]

        ys_sub2 = []
        if self.df_sub2 is not None:
            ys_sub2 = [
                list(map(int, self.df_sub2['token_id'][i].split()))
                for i in df_indices
            ]
            if self.concat_prev_n_utterances > 0:
                raise NotImplementedError
        elif self.vocab_sub2 > 0:
            ys_sub2 = [
                self.token2idx[2](self.df['text'][i]) for i in df_indices
            ]

        ys_sub3 = []
        if self.df_sub3 is not None:
            ys_sub3 = [
                list(map(int, self.df_sub3['token_id'][i].split()))
                for i in df_indices
            ]
            if self.concat_prev_n_utterances > 0:
                raise NotImplementedError
        elif self.vocab_sub3 > 0:
            ys_sub3 = [
                self.token2idx[3](self.df['text'][i]) for i in df_indices
            ]

        batch_dict = {
            'xs': xs,
            'xlens': [self.df['xlen'][i] for i in df_indices],
            'ys': ys,
            'ys_cache': ys_cache,
            'ys_sub1': ys_sub1,
            'ys_sub2': ys_sub2,
            'ys_sub3': ys_sub3,
            'utt_ids': [self.df['utt_id'][i] for i in df_indices],
            'speakers': [self.df['speaker'][i] for i in df_indices],
            'sessions': [self.df['session'][i] for i in df_indices],
            'text': [self.df['text'][i] for i in df_indices],
            'feat_path':
            [self.df['feat_path'][i] for i in df_indices]  # for plot
        }

        return batch_dict
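
A minimal, hypothetical sketch of drawing one mini-batch with make_batch, given a Dataset instance such as the train_set shown after Example #8 (the variable names and index slice are placeholders):

df_indices = np.array(sorted(train_set.rest)[:train_set.batch_size])
batch = train_set.make_batch(df_indices)
xs, xlens, ys = batch['xs'], batch['xlens'], batch['ys']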