def main():
    desc = 'Kaldi outputs token IDs in numbers. We can map them back to ' + \
           'textual form given an ID to text mapping. Will output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('fname', help='File to process. We expect each line ' + \
                        'to have tokens separated by whitespace, where ' + \
                        'the first token is a key or name (e.g. utt name) ' + \
                        'that can be skipped, and the rest are ID numbers.')
    parser.add_argument('id_map', help='Mapping from textual form to ID. ' + \
                        'We expect each line to have two tokens separated ' + \
                        'by whitespace, where the first token is the text ' + \
                        'and the second token is the ID number.')
    args = parser.parse_args()

    id_map = common.make_reverse_index(io.dict_read(args.id_map))
    # Check that mapping from number to text is 1-to-1
    for k in id_map.keys():
        if len(id_map[k]) != 1:
            raise ValueError('Mapping at {} not 1-1: {}'.format(k, id_map[k]))
        id_map[k] = id_map[k][0]

    with open(args.fname, 'r') as f:
        for line in f:
            ary = line.strip().split()
            for i in range(1, len(ary)):
                ary[i] = id_map[ary[i]]
            print ' '.join(ary)

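# A minimal sketch of the reverse-index step above, assuming that
# common.make_reverse_index maps each ID to the *list* of texts that carry it
# (which is why the 1-to-1 check is needed before collapsing the lists).
# The helper name and example values are illustrative, not part of the scripts.
def _sketch_reverse_index(text2id):
    """E.g. {'sil': '0', 'ah': '1'} -> {'0': 'sil', '1': 'ah'}."""
    id2text = {}
    for text, idx in text2id.items():
        id2text.setdefault(idx, []).append(text)
    for idx, texts in id2text.items():
        if len(texts) != 1:
            # Two different texts share one ID: the mapping cannot be inverted.
            raise ValueError('Mapping at {} not 1-1: {}'.format(idx, texts))
        id2text[idx] = texts[0]
    return id2text
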
def parse_args(self):
    parser = common.init_argparse(self._desc())
    self._register_custom_types(parser)
    self._main_args(parser)
    self._validation_args(parser)
    self._data_args(parser)
    self._model_args(parser)
    self._training_args(parser)
    self.args = parser.parse_args()
    self._check_args()

def main():
    desc = 'Convert phone to word alignment. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('ali_phones_with_length',
                        help='File containing phone alignment with length ' + \
                             '(generated with ali-to-phones --write-lengths=true)')
    parser.add_argument('text', help='Kaldi word-level transcript')
    parser.add_argument('phone_map', help='Mapping from text to phone ID. ' + \
                        'We expect each line to have two tokens separated ' + \
                        'by whitespace, where the first token is the phone ' + \
                        'and the second token is the ID number.')
    parser.add_argument('lexicon', help='Pronunciation lexicon')
    parser.add_argument('--sil-phones', nargs='+', default=[],
                        help='IDs of phones regarded as silence')
    parser.add_argument('--sil-label', default='sil',
                        help='Label of silence phone/word to use in output')
    args = parser.parse_args()

    alis = ali_with_length_read(args.ali_phones_with_length,
                                ordered=True, expand=False)
    io.log('Loaded {} alignments'.format(len(alis)))
    text = io.dict_read(args.text, lst=True)
    io.log('Loaded transcript containing {} utterances'.format(len(text)))
    phone2id = io.dict_read(args.phone_map)
    io.log('Loaded phone2id containing {} phones'.format(len(phone2id)))
    id2phone = {}
    # We normalize the phone name so that IDs of phone variants will map to
    # the primary phone. For example, IDs of sil, sil_B, sil_E, sil_I, sil_S
    # will all map to sil. The assumption here is that anything after and
    # including the '_' character is not part of the primary phone name.
    for phone in phone2id.keys():
        nphone = phone.split('_')[0]
        id2phone[phone2id[phone]] = nphone
    io.log('Total phones in id2phone: {}'.format(len(set(id2phone.values()))))
    lexicon = io.lexicon_read(args.lexicon)
    io.log('Loaded lexicon containing {} words'.format(len(lexicon)))
    sil_phones = set(args.sil_phones)
    io.log('sil_phones: {} ({}), sil_label: {}'.format(
        sil_phones, [id2phone[i] for i in sil_phones], args.sil_label))

    for key in alis:
        phone_tokens, length = get_phone_tokens(alis[key], id2phone, sil_phones)
        if len(phone_tokens) == 0:
            io.log('WARNING: {} - no non-silence tokens'.format(key))
            continue
        if key not in text:
            io.log('WARNING: {} not in text'.format(key))
            continue
        phone2word_ali(key, phone_tokens, text[key], lexicon,
                       args.sil_label, length)

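# Worked example of the phone-name normalization above, assuming anything
# after the first '_' is a positional suffix (_B/_E/_I/_S) and not part of the
# primary phone name. The helper name is illustrative only.
def _sketch_normalize_phones(phone2id):
    """E.g. {'sil': '1', 'sil_B': '2', 'ah_I': '3'} -> {'1': 'sil', '2': 'sil', '3': 'ah'}."""
    return dict((pid, phone.split('_')[0]) for phone, pid in phone2id.items())
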
def main():
    desc = 'Extract features with DNN. Output to Kaldi ark.'
    parser = common.init_argparse(desc)
    parser.add_argument('model_in', help='Model that can be read by load_dnn')
    parser.add_argument('feats_scp', help='scp of input features')
    parser.add_argument('ark_out', help='Output ark file')
    parser.add_argument('--output-layer', type=int, default=-2,
                        help='Layer to use for extracting features. ' + \
                             'Negative index can be used. For example, ' + \
                             '-1 means the last layer, and so on.')
    parser.add_argument('--context', type=int, default=8,
                        help='Number of context frames for splicing')
    parser.add_argument('--padding', default='replicate',
                        help='What to do with out-of-bound frames. Valid ' + \
                             'values: [replicate|zero]')
    parser.add_argument('--ivectors', help='Utterance i-vectors to append')
    parser.add_argument('--chunk-size', default='300m',
                        help='Chunk size for data buffering')
    args = parser.parse_args()

    io.log('Initializing dataset')
    ivectors = None if args.ivectors is None else \
        io.ivector_ark_read(args.ivectors, dtype=theano.config.floatX)
    dataset = init_dataset(args.feats_scp, args.context, args.padding, ivectors)
    io.log('Initializing model')
    dnn = load_dnn(args.model_in)

    # Initializing shared_ds according to chunk_size
    num_items = get_num_items(args.chunk_size, theano.config.floatX)
    max_frames = num_items / dataset.get_dim()
    max_utt_frames = np.max(map(dataset.get_num_frames_by_utt_name,
                                dataset.get_utt_names()))
    common.CHK_GE(max_frames, max_utt_frames)
    x = np.zeros((max_frames, dataset.get_dim()), dtype=theano.config.floatX)

    io.log('...getting extraction function')
    extract_fn = dnn.build_extract_feat_function(args.output_layer)
    io.log('Got it!')

    io.log('** Begin outputting to {} **'.format(args.ark_out))
    ark_out = KaldiWriteOut(args.ark_out)
    utt_names, utt_frames, total_frames = [], [], 0
    for utt in dataset.get_utt_names():
        frames = dataset.get_num_frames_by_utt_name(utt)
        if total_frames + frames > max_frames:
            __extract(extract_fn, ark_out, dataset, x, utt_names, utt_frames)
            utt_names, utt_frames, total_frames = [], [], 0
        utt_names.append(utt)
        utt_frames.append(frames)
        total_frames += frames
    __extract(extract_fn, ark_out, dataset, x, utt_names, utt_frames)
    ark_out.close()

def main():
    desc = 'Convert from alignment with length to regular alignments. ' + \
           'Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('ali_with_length', help='Alignment with lengths')
    args = parser.parse_args()

    ali = ali_with_length_read(args.ali_with_length, ordered=True, expand=True)
    io.log('Read {} alignments with lengths'.format(len(ali)))
    for key in ali:
        print '{} {}'.format(key, ' '.join(ali[key]))

def main():
    desc = 'Convert from speaker i-vectors to utt i-vectors. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('spk_ivectors', help='File containing spk i-vectors.')
    parser.add_argument('utt2spk', help='Kaldi utt2spk mapping.')
    args = parser.parse_args()

    spk_ivectors = ivector_ark_read(args.spk_ivectors)
    utt2spk = io.dict_read(args.utt2spk, ordered=True)
    spk2utt = common.make_reverse_index(utt2spk, ordered=True)

    wrote = 0
    for spk in spk2utt.keys():
        for utt in spk2utt[spk]:
            print_vector(utt, spk_ivectors[spk])
            wrote += 1
    io.log('Wrote {} utt i-vectors for {} spks'.format(wrote, len(spk2utt)))

def main():
    desc = 'Reads in a pdf alignment and outputs prior counts to disk.'
    parser = common.init_argparse(desc)
    parser.add_argument('alipdf', help='pdf alignment file.')
    parser.add_argument('output_fname', help='File to output prior counts to')
    parser.add_argument('--num-pdfs', type=int, help='Number of pdfs. ' + \
                        'If not set, use max value in `alipdf`.')
    args = parser.parse_args()

    alipdf = io.dict_read(args.alipdf)
    pdfs = []
    for utt in alipdf.keys():
        pdfs.extend(numpy.asarray(alipdf[utt], dtype=numpy.int))
    bins = numpy.bincount(pdfs, minlength=args.num_pdfs)

    fw = open(args.output_fname, 'w')
    fw.write('[ {} ]\n'.format(' '.join(numpy.asarray(bins, dtype=numpy.str))))
    fw.close()

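# Minimal worked example of the prior-count step above: numpy.bincount with
# minlength ensures pdfs that never occur in the alignment still get an
# explicit zero count, keeping the vector aligned with the model's output
# layer. The values below are made up for illustration.
def _sketch_prior_counts():
    pdfs = [0, 0, 2, 1, 2, 2]
    bins = numpy.bincount(pdfs, minlength=5)
    # bins is now array([2, 1, 3, 0, 0])
    return bins
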
def main():
    desc = 'Convert from one mapping to another. Will output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('fname', help='File to process. We expect each line ' + \
                        'to have tokens separated by whitespace, where ' + \
                        'the first token is a key or name (e.g. utt name) ' + \
                        'that can be skipped, and the rest are values.')
    parser.add_argument('id_map', help='Mapping from one ID to another ID. ' + \
                        'Each line has two tokens separated by whitespace.')
    args = parser.parse_args()

    id_map = io.dict_read(args.id_map)
    io.log('Read {} mappings'.format(len(id_map)))
    with open(args.fname, 'r') as f:
        for line in f:
            ary = line.strip().split()
            for i in range(1, len(ary)):
                ary[i] = id_map[ary[i]]
            print ' '.join(ary)

def main():
    desc = 'Convert from utt i-vectors to spk i-vectors. NOTE: this ' + \
           'script does not check the values of utt i-vectors that belong ' + \
           'to the same spk. It will simply treat the first utt i-vector ' + \
           'it finds from a spk as the i-vector for that spk. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('utt_ivectors', help='File containing utt i-vectors.')
    parser.add_argument('utt2spk', help='Kaldi utt2spk mapping.')
    args = parser.parse_args()

    utt_ivectors = ivector_ark_read(args.utt_ivectors, ordered=True)
    utt2spk = io.dict_read(args.utt2spk)

    processed_spks = set()
    for utt in utt_ivectors.keys():
        spk = utt2spk[utt]
        if spk in processed_spks:
            continue
        print_vector(spk, utt_ivectors[utt])
        processed_spks.add(spk)
    io.log('Wrote {} spk i-vectors'.format(len(processed_spks)))

def main():
    desc = 'Use phone alignment to generate VAD vectors. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('ali_phones_with_length',
                        help='File containing phone alignment with length ' + \
                             '(generated with ali-to-phones --write-lengths=true)')
    parser.add_argument('silphones', help='List of phones regarded as silence')
    args = parser.parse_args()

    silphones = set(io.read_lines(args.silphones))
    io.log('{} silence phones: {}'.format(len(silphones), ':'.join(silphones)))
    alis = ali_with_length_read(args.ali_phones_with_length,
                                ordered=True, expand=False)
    io.log('Loaded {} alignments'.format(len(alis)))

    for key in alis:
        vad = []
        for ali in alis[key]:
            phone, length = ali
            vad.extend([0.0 if phone in silphones else 1.0] * length)
        print_vector(key, vad)

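# Worked example of the VAD expansion above: each (phone, length) entry in an
# unexpanded alignment contributes `length` frames, 0.0 for silence phones and
# 1.0 otherwise. The phone labels here are made up for illustration.
def _sketch_vad(ali, silphones):
    """E.g. _sketch_vad([('sil', 3), ('ah', 2)], {'sil'}) -> [0.0, 0.0, 0.0, 1.0, 1.0]."""
    vad = []
    for phone, length in ali:
        vad.extend([0.0 if phone in silphones else 1.0] * length)
    return vad
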
def main():
    desc = 'Outputs Kaldi-compatible log-likelihoods to stdout using a pdnn ' + \
           'model. This mimics the design of Kaldi nnet-forward. Use this ' + \
           'for networks that cannot be converted to Kaldi, e.g. factored models.'
    parser = common.init_argparse(desc)
    parser.add_argument('model_in', help='Model that can be read by load_dnn')
    parser.add_argument('feats_scp', help='scp of input features')
    parser.add_argument('--context', type=int, default=8,
                        help='Number of context frames for splicing')
    parser.add_argument('--padding', default='replicate',
                        help='What to do with out-of-bound frames. Valid ' + \
                             'values: [replicate|zero]')
    parser.add_argument('--class-frame-counts', help='Kaldi vector with ' + \
                        'frame-counts of pdfs to compute log-priors')
    parser.add_argument('--prior-floor', type=float, default=1e-10,
                        help='Flooring constant for prior probability, ' + \
                             'i.e. pdfs with prior smaller than this ' + \
                             'value will be ignored during decoding.')
    parser.add_argument('--ivectors', help='Utterance i-vectors to append')
    parser.add_argument('--chunk-size', default='300m',
                        help='Chunk size for data buffering')
    args = parser.parse_args()

    io.log('Initializing dataset')
    ivectors = None if args.ivectors is None else \
        ivector_ark_read(args.ivectors, dtype=theano.config.floatX)
    dataset = init_dataset(args.feats_scp, args.context, args.padding, ivectors)
    io.log('Initializing model')
    dnn = load_dnn(args.model_in)
    io.log('Initializing priors')
    log_priors = get_log_priors(args.class_frame_counts, args.prior_floor)

    # Initializing shared_ds according to chunk_size
    num_items = get_num_items(args.chunk_size, theano.config.floatX)
    max_frames = num_items / dataset.get_dim()
    max_utt_frames = np.max(map(dataset.get_num_frames_by_utt_name,
                                dataset.get_utt_names()))
    common.CHK_GE(max_frames, max_utt_frames)
    x = np.zeros((max_frames, dataset.get_dim()), dtype=theano.config.floatX)
    shared_x = theano.shared(x, name='x', borrow=True)
    io.log('Using shared_x with size {} ({})'.format(x.shape, args.chunk_size))

    io.log('...getting output function')
    output_fn = dnn.build_output_function(shared_x)
    io.log('Got it!')

    io.log('** Begin outputting **')
    utt_names, utt_frames, total_frames = [], [], 0
    for utt in dataset.get_utt_names():
        frames = dataset.get_num_frames_by_utt_name(utt)
        if total_frames + frames > max_frames:
            __nnet_fwd(output_fn, dataset, x, shared_x, utt_names,
                       utt_frames, log_priors)
            utt_names, utt_frames, total_frames = [], [], 0
        utt_names.append(utt)
        utt_frames.append(frames)
        total_frames += frames
    __nnet_fwd(output_fn, dataset, x, shared_x, utt_names, utt_frames, log_priors)

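# Hedged sketch of what the prior initialization above presumably does, based
# on the --class-frame-counts and --prior-floor help text: turn Kaldi frame
# counts into floored log-priors, which are later subtracted from the
# network's log-posteriors to get scaled log-likelihoods
# (log p(x|s) = log p(s|x) - log p(s), up to a constant). The actual
# get_log_priors implementation is not shown here; this is an illustration.
def _sketch_log_priors(frame_counts, prior_floor=1e-10):
    counts = np.asarray(frame_counts, dtype=np.float64)
    priors = counts / np.sum(counts)
    # Per the --prior-floor help text, pdfs with priors below the floor are
    # effectively unusable during decoding; flooring keeps the log finite.
    return np.log(np.maximum(priors, prior_floor))
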
import chaipy.common as common
import chaipy.io as io

from chaipy.data.temporal import TemporalData


def main(args):
    ds = TemporalData.from_kaldi(args.scp)
    io.log('Loaded dataset containing {} utts'.format(len(ds.get_utt_names())))
    utt2label = io.dict_read(args.utt2label)
    io.log('Loaded utt2label containing {} entries'.format(len(utt2label)))

    for utt_name in ds.get_utt_names():
        if utt_name not in utt2label:
            io.log('WARNING: {} not in utt2label, skipping'.format(utt_name))
            continue
        lbl = utt2label[utt_name]
        dur = ds.get_num_frames_by_utt_name(utt_name)
        print '{} {}'.format(utt_name, ' '.join([lbl] * dur))


if __name__ == '__main__':
    desc = 'Takes in a Kaldi scp and utterance-level labels, outputs ' + \
           'frame-level labels of all utterances in the scp to stdout. ' + \
           'Utterances that are in the scp but not in the label mapping ' + \
           'will be skipped.'
    parser = common.init_argparse(desc)
    parser.add_argument('scp', help='Kaldi scp')
    parser.add_argument('utt2label', help='Mapping from utterance to label')
    main(parser.parse_args())

def main():
    desc = 'Outputs Kaldi-compatible log-likelihoods to stdout using a ' + \
           'Keras model. This mimics the design of Kaldi nnet-forward.'
    parser = common.init_argparse(desc)
    parser.add_argument('model_json', help='JSON description of the model')
    parser.add_argument('model_weights', help='File containing model weights')
    parser.add_argument('feats_scp', help='scp of input features')
    parser.add_argument('--context', type=int, default=8,
                        help='Number of context frames for splicing')
    parser.add_argument('--padding', default='replicate',
                        help='What to do with out-of-bound frames. Valid ' + \
                             'values: [replicate|zero]')
    parser.add_argument('--primary-task', type=int,
                        help='Set to enable multi-task model decoding')
    parser.add_argument('--nutts', type=int, default=10,
                        help='How many utterances to feed to the model at once')
    parser.add_argument('--delay', type=int, default=5,
                        help='Output delay in frames')
    parser.add_argument('--class-frame-counts', help='Kaldi vector with ' + \
                        'frame-counts of pdfs to compute log-priors')
    parser.add_argument('--prior-floor', type=float, default=1e-10,
                        help='Flooring constant for prior probability, ' + \
                             'i.e. pdfs with prior smaller than this ' + \
                             'value will be ignored during decoding.')
    parser.add_argument('--ivectors', help='Utterance i-vectors to append')
    args = parser.parse_args()

    io.log('Initializing dataset')
    ivectors = None if args.ivectors is None else \
        ivector_ark_read(args.ivectors, dtype=np.float32)
    buf_ds = init_dataset(
        args.feats_scp, args.context, args.padding,
        args.nutts, args.delay, ivectors
    )
    io.log('Initializing model')
    json_str = io.json_load(args.model_json)
    model = model_from_json(json_str)
    model.load_weights(args.model_weights)
    io.log('Initializing priors')
    log_priors = get_log_priors(args.class_frame_counts, args.prior_floor)
    if args.primary_task is not None:
        io.log('Multi-task decoding enabled, primary task {}'.format(
            args.primary_task))

    io.log('** Begin outputting **')
    while True:
        # Load data chunk
        chunk = buf_ds.read_next_chunk()
        if chunk is None:
            break
        Xs, _, eobs, utt_indices = chunk
        X = Xs[0]
        eob = eobs[0]
        utt_names = buf_ds.dataset().get_utt_names_by_utt_indices(utt_indices)
        y = model.predict(X, batch_size=len(utt_indices), verbose=0)
        if args.primary_task is not None:
            y = y[args.primary_task]
        y = np.log(y, y)
        if log_priors is not None:
            y -= log_priors
        for i in range(len(utt_indices)):
            print_matrix(utt_names[i], y[i][buf_ds.get_delay():eob[i]])