def _load_lexicon(self):
    """
    Load the lexicon XML file given by ``self.lexicon_name`` and store the
    parsed result in ``self.lexicon``.

    The important attributes of the loaded object are ``lexicon.lemmas``
    and ``lexicon.phonemes``.

    :raises AssertionError: if ``self.lexicon_name`` is not an existing file
    """
    from os.path import isfile
    from Log import log
    from LmDataset import Lexicon
    # Include the path in the message so a missing file is easy to track down.
    assert isfile(self.lexicon_name), "Lexicon file does not exist: %r" % self.lexicon_name
    log.initialize(verbosity=[5])  # Lexicon() logs while parsing; set up logging first
    self.lexicon = Lexicon(self.lexicon_name)
def main(argv):
    """
    Collect orth symbols from a corpus given on the command line.

    The ``input`` argument may be a Bliss XML corpus, a Returnn (CRNN)
    config (the corpus is then taken from its train dataset), or a plain
    txt file. With ``--lexicon``, also report which corpus words are
    missing from the lexicon.

    :param list[str] argv: sys.argv-style argument list (argv[0] is ignored)
    """
    argparser = argparse.ArgumentParser(description='Collect orth symbols.')
    argparser.add_argument('input', help="CRNN config, Corpus Bliss XML or just txt-data")
    argparser.add_argument("--dump_orth", action="store_true")
    argparser.add_argument("--lexicon")
    args = argparser.parse_args(argv[1:])
    bliss_filename = None
    crnn_config_filename = None
    txt_filename = None
    if is_bliss(args.input):
        bliss_filename = args.input
        print("Read Bliss corpus:", bliss_filename)
    elif is_crnn_config(args.input):
        crnn_config_filename = args.input
        print("Read corpus from Returnn config:", crnn_config_filename)
    else:  # treat just as txt
        txt_filename = args.input
        print("Read corpus from txt-file:", txt_filename)
    init(configFilename=crnn_config_filename)

    # Bind the corpus iteration to a callback-style interface.
    # Plain defs instead of lambda assignments (PEP 8 E731).
    if bliss_filename:
        def iter_corpus(cb):
            return iter_bliss(bliss_filename, options=args, callback=cb)
    elif txt_filename:
        def iter_corpus(cb):
            return iter_txt(txt_filename, options=args, callback=cb)
    else:
        def iter_corpus(cb):
            return iter_dataset(rnn.train_data, options=args, callback=cb)

    corpus_stats = CollectCorpusStats(args, iter_corpus)

    if args.lexicon:
        print("Lexicon:", args.lexicon)
        lexicon = Lexicon(args.lexicon)
        print("Words not in lexicon:")
        c = 0
        for w in sorted(corpus_stats.words):
            if w not in lexicon.lemmas:
                print(w)
                c += 1
        if corpus_stats.words:
            print("Count: %i (%f%%)" % (c, 100. * float(c) / len(corpus_stats.words)))
        else:
            # Empty corpus: avoid ZeroDivisionError in the percentage.
            print("Count: %i (no words in corpus)" % c)
    else:
        print("No lexicon provided (--lexicon).")

    if crnn_config_filename:
        rnn.finalize()
def load_lexicon(self, lexicon_name='recog.150k.final.lex.gz'):
    """
    Read the given lexicon file (XML) and keep the parsed :class:`Lexicon`
    on ``self.lexicon``.

    Relevant attributes of the result are ``lemmas`` and ``phonemes``.

    :param str lexicon_name: holds the path and name of the lexicon file
    """
    from LmDataset import Lexicon
    from os.path import isfile
    from Log import log
    # Lexicon() logs during parsing, so logging must be initialized first.
    log.initialize(verbosity=[5])
    assert isfile(lexicon_name), "Lexicon file does not exist"
    lexicon = Lexicon(lexicon_name)
    self.lexicon = lexicon
def __load_lexicon(lexFile):
    """
    Load a lexicon from an XML file and return its parsed content.

    The important attributes of the returned object are ``lex.lemmas``
    and ``lex.phonemes``.

    :param str lexFile: lexicon file with xml structure
    :return: the parsed lexicon
    :rtype: Lexicon
    :raises AssertionError: if lexFile is not an existing file
    """
    from os.path import isfile
    from Log import log
    from LmDataset import Lexicon
    # Include the path in the message so a missing file is easy to track down.
    assert isfile(lexFile), "Lexicon file does not exist: %r" % lexFile
    log.initialize(verbosity=[5])  # Lexicon() logs while parsing
    lex = Lexicon(lexFile)
    return lex
def _load_lexicon(self, reload=False):
    """
    Load the lexicon from ``self.lexicon_name`` into ``self.lexicon``.

    The important attributes of the loaded object are ``lex.lemmas`` and
    ``lex.phonemes``. If a valid :class:`Lexicon` instance is already set,
    nothing happens unless `reload` is given.

    :param bool reload: force re-loading the lexicon from file
    :raises AssertionError: if the lexicon file does not exist
    """
    from LmDataset import Lexicon
    # getattr with default: self.lexicon might never have been assigned,
    # in which case plain attribute access would raise AttributeError.
    if not isinstance(getattr(self, "lexicon", None), Lexicon):
        reload = True
    if reload:
        from os.path import isfile
        from Log import log
        assert isfile(self.lexicon_name), "Lexicon file does not exist: %r" % self.lexicon_name
        log.initialize(verbosity=[5])  # Lexicon() logs while parsing
        self.lexicon = Lexicon(self.lexicon_name)
def main(): argparser = ArgumentParser() argparser.add_argument("file", help="by Returnn search, in 'py' format") argparser.add_argument("--out", required=True, help="output filename") args = argparser.parse_args() d = eval(open(args.file, "r").read()) assert isinstance(d, dict) # seq_tag -> bpe string assert not os.path.exists(args.out) lex_out = {} lexicon_file = "/work/asr4/zeyer/backup/switchboard/tuske-train.lex.v1_0_3.ci.gz" lexicon = Lexicon(lexicon_file) for word in lexicon.lemmas: list_phones = [] for item in lexicon.lemmas[word]['phons']: list_phones.append(item['phon']) lex_out[word] = list_phones duplicates = {} # phone -> count for word, phones in sorted(lex_out.items()): for phone in phones: if phone in duplicates: lex_out[word].remove(phone) lex_out[word].insert(0, '%s #%s' % (phone, duplicates[phone])) duplicates[phone] += 1 else: duplicates[phone] = 1 rev_lex = {v[0]: k for k,v in lex_out.items() if len(v)>0} with open(args.out, "w") as out: out.write("{\n") for seq_tag, txt in sorted(d.items()): seq = [w.strip() for w in txt.split("<eow>")] seq = ' '.join([rev_lex[x] if x in rev_lex else "[UNKNOWN]" for x in seq if len(x)>0]).strip() out.write("%r: %r,\n" %(seq_tag, seq)) out.write("}\n") print("# Done.")
def main():
    """
    Build an allophone-state tying table from a lexicon and (optionally) a
    dataset with target label sequences, and write it as a state-tying file.

    Supports "monophone" and "full" state tying (``--state_tying_type``).
    For monophone tying, the mapping label-index -> allophone state is
    inferred by aligning allophone state sequences (corpus orth via the
    lexicon) with the dataset's target label sequences.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--action")
    arg_parser.add_argument("--print_seq", action='store_true')
    arg_parser.add_argument("--print_allos", action='store_true')
    arg_parser.add_argument("--print_targets", action='store_true')
    arg_parser.add_argument("--dataset")
    arg_parser.add_argument("--corpus")
    arg_parser.add_argument("--lexicon", help="filename")
    arg_parser.add_argument("--silence", type=int, help="index")
    arg_parser.add_argument("--context", default=1, type=int)
    arg_parser.add_argument("--hmm_states", default=3, type=int)
    arg_parser.add_argument("--state_tying_type", help="'monophone' or 'full'")
    arg_parser.add_argument("--state_tying_output", help="filename")
    arg_parser.add_argument("--allo_add_all", action="store_true")
    args = arg_parser.parse_args()
    dataset = init_dataset_via_str(config_str=args.dataset) if args.dataset else None
    corpus = dict(iter_bliss_orth(filename=args.corpus)) if args.corpus else None
    lexicon = Lexicon(filename=args.lexicon) if args.lexicon else None
    silence_label = args.silence
    if args.action == "show_corpus":
        pprint(corpus)
        return
    print("Num phones: %i" % len(lexicon.phonemes), file=log.v1)
    print("Phones: %r" % sorted(lexicon.phonemes.keys()), file=log.v1)
    orth_handler = OrthHandler(
        lexicon=lexicon, allo_context_len=args.context, allo_num_states=args.hmm_states)
    map_idx_to_allo = defaultdict(set)  # type: dict[int, set[AllophoneState]]
    map_allo_to_idx = {}  # type: dict[AllophoneState, int]
    if args.allo_add_all:
        orth_handler.allo_add_all = True
    print("Num HMM states: %i" % orth_handler.allo_num_states, file=log.v1)
    if args.state_tying_type == "monophone":
        print("Monophone state tying.", file=log.v1)
        num_labels = orth_handler.expected_num_labels_for_monophone_state_tying()
        all_label_idx_are_used = True
    elif args.state_tying_type == "full":
        print("Full state tying.", file=log.v1)
        phone_idxs = {k: i + 1 for (i, k) in enumerate(lexicon.phoneme_list)}  # +1 to keep 0 reserved as the term-symbol
        for phon in lexicon.phoneme_list:
            for allo in orth_handler.all_allophone_variations(phon, all_boundary_variations=True):
                allo_idx = allo.index(
                    phone_idxs=phone_idxs,
                    num_states=orth_handler.allo_num_states,
                    context_length=orth_handler.allo_context_len)
                map_idx_to_allo[allo_idx].add(allo)
        num_labels = max(map_idx_to_allo.keys()) + 1
        all_label_idx_are_used = False
    else:
        raise Exception("invalid state tying type %r" % args.state_tying_type)
    print("Num labels: %i" % num_labels, file=log.v1)
    if dataset:
        count = 0
        for segment_name, targets in iter_dataset_targets(dataset):
            count += 1
            if silence_label is None or count == 1:
                # Guess silence as the most frequent target label of this seq.
                likely_silence_label = collections.Counter(targets).most_common(1)[0][0]
                if silence_label is None:
                    silence_label = likely_silence_label
                if silence_label != likely_silence_label:
                    print("warning: silence %i but likely %i" % (silence_label, likely_silence_label), file=log.v2)
                print("Silence label: %i" % silence_label, file=log.v1)
                orth_handler.si_label = silence_label
                # Monophone state tying:
                for allo in orth_handler.all_allophone_variations(orth_handler.si_phone):
                    map_idx_to_allo[silence_label].add(allo)
                    map_allo_to_idx[allo] = silence_label
            assert segment_name in corpus
            orth = corpus[segment_name]
            allo_states = orth_handler.orth_to_allophone_states(orth=orth)
            if args.print_seq:
                print("%r %r" % (segment_name, orth))
            if args.print_allos:
                print(" allophone state seq: %r" % allo_states)
            tgt_seq = [t for t in uniq(targets) if t != silence_label]
            if args.print_targets:
                print(" target seq: %r" % (tgt_seq,))
            assert len(allo_states) == len(tgt_seq), "check --hmm_states or so"
            for allo, t in zip(allo_states, tgt_seq):
                allo.boundary = 0  # do not differ between boundaries
                allos = map_idx_to_allo[t]
                if allo in map_allo_to_idx:
                    assert allo in allos, "bad mapping"
                else:
                    assert allo not in allos
                    allos.add(allo)
                    map_allo_to_idx[allo] = t
            if len(map_idx_to_allo) >= num_labels:
                assert len(map_idx_to_allo) == num_labels
                assert 0 in map_idx_to_allo
                assert num_labels - 1 in map_idx_to_allo
                print("Finished with uniq mapping after %i sequences." % count, file=log.v1)
                break
            if count % 100 == 0:
                print("Have indices: %i (num labels: %i)" % (len(map_idx_to_allo), num_labels), file=log.v1)
    print("Finished. Have indices: %i (num labels: %i)" % (len(map_idx_to_allo), num_labels), file=log.v1)
    if len(map_idx_to_allo) < num_labels:
        found = []
        not_found = []
        for p in sorted(lexicon.phonemes.keys()):
            allo = AllophoneState(p, state=0)
            if allo in map_allo_to_idx:
                found.append(p)
            else:
                not_found.append(p)
        print("Phonemes found: %r" % found)
        print("Phonemes not found: %r" % not_found)
    if args.state_tying_output:
        assert not os.path.exists(args.state_tying_output)
        if all_label_idx_are_used:
            assert len(map_idx_to_allo) == num_labels
            assert 0 in map_idx_to_allo
            assert num_labels - 1 in map_idx_to_allo
        # with-statement instead of manual open/close: the file is also
        # closed if a write raises.
        with open(args.state_tying_output, "w") as f:
            for i, allos in sorted(map_idx_to_allo.items()):
                for allo in allos:
                    f.write("%s %i\n" % (allo.format(), i))
        print("Wrote state tying to %r." % args.state_tying_output, file=log.v1)
    print("The end.")
def main():
    """
    Infer a monophone state tying by aligning allophone state sequences
    (corpus orth via the lexicon) with the dataset's target label sequences,
    and optionally write the resulting state-tying file.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--action")
    arg_parser.add_argument("--print_seq", action='store_true')
    arg_parser.add_argument("--print_allos", action='store_true')
    arg_parser.add_argument("--print_targets", action='store_true')
    arg_parser.add_argument("--dataset")
    arg_parser.add_argument("--corpus")
    arg_parser.add_argument("--lexicon")
    arg_parser.add_argument("--silence", type=int)
    arg_parser.add_argument("--context", default=1, type=int)
    arg_parser.add_argument("--hmm_states", default=3, type=int)
    arg_parser.add_argument("--state_tying_output")
    arg_parser.add_argument("--allo_add_all", action="store_true")
    args = arg_parser.parse_args()
    dataset = init_dataset_via_str(config_str=args.dataset) if args.dataset else None
    corpus = dict(iter_bliss_orth(filename=args.corpus)) if args.corpus else None
    lexicon = Lexicon(filename=args.lexicon) if args.lexicon else None
    silence_label = args.silence
    if args.action == "show_corpus":
        pprint(corpus)
        return
    print("Num phones: %i" % len(lexicon.phonemes), file=log.v1)
    print("Phones: %r" % sorted(lexicon.phonemes.keys()), file=log.v1)
    orth_handler = OrthHandler(
        lexicon=lexicon, allo_context_len=args.context, allo_num_states=args.hmm_states)
    map_idx_to_allo = defaultdict(set)  # type: dict[int, set[AllophoneState]]
    map_allo_to_idx = {}  # type: dict[AllophoneState, int]
    if args.allo_add_all:
        orth_handler.allo_add_all = True
    # NOTE: Assume monophone state tying for now!
    num_labels = orth_handler.expected_num_labels_for_monophone_state_tying()
    print("Num labels: %i" % num_labels, file=log.v1)
    count = 0
    for segment_name, targets in iter_dataset_targets(dataset):
        count += 1
        if silence_label is None or count == 1:
            # Guess silence as the most frequent target label of this seq.
            likely_silence_label = collections.Counter(targets).most_common(1)[0][0]
            if silence_label is None:
                silence_label = likely_silence_label
            if silence_label != likely_silence_label:
                print("warning: silence %i but likely %i" % (silence_label, likely_silence_label), file=log.v2)
            print("Silence label: %i" % silence_label, file=log.v1)
            orth_handler.si_label = silence_label
            # Monophone state tying:
            for allo in orth_handler.all_allophone_variations(orth_handler.si_phone):
                map_idx_to_allo[silence_label].add(allo)
                map_allo_to_idx[allo] = silence_label
        assert segment_name in corpus
        orth = corpus[segment_name]
        allo_states = orth_handler.orth_to_allophone_states(orth=orth)
        if args.print_seq:
            print("%r %r" % (segment_name, orth))
        if args.print_allos:
            print(" allophone state seq: %r" % allo_states)
        tgt_seq = [t for t in uniq(targets) if t != silence_label]
        if args.print_targets:
            print(" target seq: %r" % (tgt_seq,))
        assert len(allo_states) == len(tgt_seq), "check --hmm_states or so"
        for allo, t in zip(allo_states, tgt_seq):
            allo.boundary = 0  # do not differ between boundaries
            allos = map_idx_to_allo[t]
            if allo in map_allo_to_idx:
                assert allo in allos, "bad mapping"
            else:
                assert allo not in allos
                allos.add(allo)
                map_allo_to_idx[allo] = t
        if len(map_idx_to_allo) >= num_labels:
            assert len(map_idx_to_allo) == num_labels
            assert 0 in map_idx_to_allo
            assert num_labels - 1 in map_idx_to_allo
            print("Finished with uniq mapping after %i sequences." % count, file=log.v1)
            break
        if count % 100 == 0:
            print("Have indices: %i (num labels: %i)" % (len(map_idx_to_allo), num_labels), file=log.v1)
    print("Finished. Have indices: %i (num labels: %i)" % (len(map_idx_to_allo), num_labels), file=log.v1)
    if len(map_idx_to_allo) < num_labels:
        found = []
        not_found = []
        for p in sorted(lexicon.phonemes.keys()):
            allo = AllophoneState(p, state=0)
            if allo in map_allo_to_idx:
                found.append(p)
            else:
                not_found.append(p)
        print("Phonemes found: %r" % found)
        print("Phonemes not found: %r" % not_found)
    if args.state_tying_output:
        assert not os.path.exists(args.state_tying_output)
        assert len(map_idx_to_allo) == num_labels
        assert 0 in map_idx_to_allo
        assert num_labels - 1 in map_idx_to_allo
        # with-statement instead of manual open/close: the file is also
        # closed if a write raises.
        with open(args.state_tying_output, "w") as f:
            for i in range(num_labels):
                # Every label idx must map to exactly one (phone, state) pair.
                phons = sorted(set([(allo.id, allo.state) for allo in map_idx_to_allo[i]]))
                assert len(phons) == 1
                phon, state = phons[0]
                for allo in orth_handler.all_allophone_variations(phon, states=[state]):
                    f.write("%s %i\n" % (allo, i))
        print("Wrote state tying to %r." % args.state_tying_output, file=log.v1)