Exemplo n.º 1
0
  def _load_lexicon(self):
    """
    Load the lexicon XML file referenced by ``self.lexicon_name`` and store
    the parsed result in ``self.lexicon``.

    The resulting ``Lexicon`` exposes ``lemmas`` and ``phonemes``, which are
    the attributes callers rely on.

    :raises AssertionError: if ``self.lexicon_name`` is not an existing file
    """
    from os.path import isfile
    from Log import log
    from LmDataset import Lexicon

    # Fail early with a clear message instead of a parse error inside Lexicon.
    assert isfile(self.lexicon_name), "Lexicon does not exist"

    log.initialize(verbosity=[5])
    self.lexicon = Lexicon(self.lexicon_name)
Exemplo n.º 2
0
def main(argv):
    """
    Collect orth symbols from a corpus given as a Bliss XML, a Returnn (CRNN)
    config, or a plain txt file, and optionally report which corpus words are
    missing from a lexicon (``--lexicon``).
    """
    argparser = argparse.ArgumentParser(description='Collect orth symbols.')
    argparser.add_argument(
        'input', help="CRNN config, Corpus Bliss XML or just txt-data")
    argparser.add_argument("--dump_orth", action="store_true")
    argparser.add_argument("--lexicon")
    args = argparser.parse_args(argv[1:])

    # Detect which kind of input we were handed.
    bliss_filename = None
    crnn_config_filename = None
    txt_filename = None
    if is_bliss(args.input):
        bliss_filename = args.input
        print("Read Bliss corpus:", bliss_filename)
    elif is_crnn_config(args.input):
        crnn_config_filename = args.input
        print("Read corpus from Returnn config:", crnn_config_filename)
    else:  # treat just as txt
        txt_filename = args.input
        print("Read corpus from txt-file:", txt_filename)
    init(configFilename=crnn_config_filename)

    # Bind the matching corpus iterator; each variant takes a per-item callback.
    if bliss_filename:
        def iter_corpus(cb):
            return iter_bliss(bliss_filename, options=args, callback=cb)
    elif txt_filename:
        def iter_corpus(cb):
            return iter_txt(txt_filename, options=args, callback=cb)
    else:
        def iter_corpus(cb):
            return iter_dataset(rnn.train_data, options=args, callback=cb)
    corpus_stats = CollectCorpusStats(args, iter_corpus)

    if args.lexicon:
        print("Lexicon:", args.lexicon)
        lexicon = Lexicon(args.lexicon)
        print("Words not in lexicon:")
        missing = [w for w in sorted(corpus_stats.words)
                   if w not in lexicon.lemmas]
        for w in missing:
            print(w)
        print("Count: %i (%f%%)" %
              (len(missing), 100. * float(len(missing)) / len(corpus_stats.words)))
    else:
        print("No lexicon provided (--lexicon).")

    if crnn_config_filename:
        rnn.finalize()
Exemplo n.º 3
0
  def load_lexicon(self, lexicon_name='recog.150k.final.lex.gz'):
    """
    Parse the given lexicon XML file and keep the result on ``self.lexicon``.

    The parsed ``Lexicon`` object exposes ``lemmas`` and ``phonemes``,
    which are the attributes of interest to callers.

    :param str lexicon_name: holds the path and name of the lexicon file
    """
    from LmDataset import Lexicon
    from os.path import isfile
    from Log import log

    log.initialize(verbosity=[5])
    assert isfile(lexicon_name), "Lexicon file does not exist"
    lex = Lexicon(lexicon_name)
    self.lexicon = lex
Exemplo n.º 4
0
def __load_lexicon(lexFile):
    '''
    Load a lexicon from an XML file and return its parsed content.

    :param str lexFile: lexicon file with xml structure
    :return: parsed lexicon; ``lex.lemmas`` and ``lex.phonemes`` are the
        attributes of interest
    :rtype: Lexicon
    :raises AssertionError: if ``lexFile`` is not an existing file
    '''
    from os.path import isfile
    from Log import log
    from LmDataset import Lexicon

    # Fail early with a clear message instead of a parse error inside Lexicon.
    assert isfile(lexFile), "Lexicon does not exist"

    log.initialize(verbosity=[5])
    return Lexicon(lexFile)
Exemplo n.º 5
0
    def _load_lexicon(self, reload=False):
        '''
    Ensure ``self.lexicon`` holds a parsed ``Lexicon`` loaded from
    ``self.lexicon_name``; (re)load it when forced via ``reload`` or when
    no valid Lexicon instance is cached yet.
    where:
      lex.lemmas and lex.phonemes important
    :param bool reload: should lexicon be reloaded
    '''
        from LmDataset import Lexicon
        # Force a load if nothing valid is cached yet.
        if not isinstance(self.lexicon, Lexicon):
            reload = True
        if not reload:
            return

        from os.path import isfile
        from Log import log

        assert isfile(self.lexicon_name), "Lexicon does not exists"

        log.initialize(verbosity=[5])

        self.lexicon = Lexicon(self.lexicon_name)
def main():
    """
    Convert Returnn search output (seq_tag -> phone/BPE string, '<eow>'
    separated) back to word sequences via a reverse pronunciation lookup
    built from a fixed Switchboard lexicon, and write the result to
    ``--out`` in the same py-dict format.
    """
    argparser = ArgumentParser()
    argparser.add_argument("file", help="by Returnn search, in 'py' format")
    argparser.add_argument("--out", required=True, help="output filename")
    args = argparser.parse_args()
    # NOTE(review): eval on file content — only safe for trusted input files;
    # consider ast.literal_eval if the file is guaranteed to be a literal.
    with open(args.file, "r") as f:
        d = eval(f.read())
    assert isinstance(d, dict)  # seq_tag -> bpe string
    assert not os.path.exists(args.out)

    # word -> list of pronunciations (phone strings).
    lex_out = {}
    lexicon_file = "/work/asr4/zeyer/backup/switchboard/tuske-train.lex.v1_0_3.ci.gz"
    lexicon = Lexicon(lexicon_file)
    for word in lexicon.lemmas:
        lex_out[word] = [item['phon'] for item in lexicon.lemmas[word]['phons']]

    # Disambiguate pronunciations shared by several entries: the first
    # occurrence keeps the plain phone string, later ones get a " #<count>"
    # suffix and are moved to the front (becoming the primary pronunciation).
    duplicates = {}  # phone -> count
    for word, phones in sorted(lex_out.items()):
        # Iterate over a snapshot: `phones` is the same list object as
        # `lex_out[word]`, and the original code mutated it (remove/insert)
        # while iterating, which skips elements.
        for phone in list(phones):
            if phone in duplicates:
                lex_out[word].remove(phone)
                lex_out[word].insert(0, '%s #%s' % (phone, duplicates[phone]))
                duplicates[phone] += 1
            else:
                duplicates[phone] = 1

    # Reverse map: primary pronunciation -> word.
    rev_lex = {v[0]: k for k, v in lex_out.items() if len(v) > 0}
    with open(args.out, "w") as out:
        out.write("{\n")
        for seq_tag, txt in sorted(d.items()):
            seq = [w.strip() for w in txt.split("<eow>")]
            seq = ' '.join([rev_lex[x] if x in rev_lex else "[UNKNOWN]" for x in seq if len(x) > 0]).strip()
            out.write("%r: %r,\n" % (seq_tag, seq))
        out.write("}\n")
    print("# Done.")
Exemplo n.º 7
0
def main():
    """
    Build an allophone-state -> label-index state tying from a dataset,
    a Bliss corpus and a lexicon, and optionally write it to a file.

    Supports 'monophone' and 'full' state tying (--state_tying_type).
    The mapping is derived by aligning target label sequences from the
    dataset with allophone-state sequences generated from the corpus orth.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--action")
    arg_parser.add_argument("--print_seq", action='store_true')
    arg_parser.add_argument("--print_allos", action='store_true')
    arg_parser.add_argument("--print_targets", action='store_true')
    arg_parser.add_argument("--dataset")
    arg_parser.add_argument("--corpus")
    arg_parser.add_argument("--lexicon", help="filename")
    arg_parser.add_argument("--silence", type=int, help="index")
    arg_parser.add_argument("--context", default=1, type=int)
    arg_parser.add_argument("--hmm_states", default=3, type=int)
    arg_parser.add_argument("--state_tying_type", help="'monophone' or 'full'")
    arg_parser.add_argument("--state_tying_output", help="filename")
    arg_parser.add_argument("--allo_add_all", action="store_true")
    args = arg_parser.parse_args()

    # Each input is optional; the corresponding variable stays None when the
    # argument was not given.
    dataset = init_dataset_via_str(
        config_str=args.dataset) if args.dataset else None
    corpus = dict(iter_bliss_orth(
        filename=args.corpus)) if args.corpus else None
    lexicon = Lexicon(filename=args.lexicon) if args.lexicon else None
    silence_label = args.silence

    if args.action == "show_corpus":
        pprint(corpus)
        return

    print("Num phones: %i" % len(lexicon.phonemes), file=log.v1)
    print("Phones: %r" % sorted(lexicon.phonemes.keys()), file=log.v1)

    orth_handler = OrthHandler(lexicon=lexicon,
                               allo_context_len=args.context,
                               allo_num_states=args.hmm_states)
    map_idx_to_allo = defaultdict(set)  # type: dict[int, set[AllophoneState]]
    map_allo_to_idx = {}  # type: dict[AllophoneState, int]
    if args.allo_add_all:
        orth_handler.allo_add_all = True

    print("Num HMM states: %i" % orth_handler.allo_num_states, file=log.v1)
    if args.state_tying_type == "monophone":
        print("Monophone state tying.", file=log.v1)
        num_labels = orth_handler.expected_num_labels_for_monophone_state_tying(
        )
        all_label_idx_are_used = True
    elif args.state_tying_type == "full":
        print("Full state tying.", file=log.v1)
        # For full tying, enumerate every allophone variation up front.
        phone_idxs = {k: i + 1
                      for (i, k) in enumerate(lexicon.phoneme_list)
                      }  # +1 to keep 0 reserved as the term-symbol
        for phon in lexicon.phoneme_list:
            for allo in orth_handler.all_allophone_variations(
                    phon, all_boundary_variations=True):
                allo_idx = allo.index(
                    phone_idxs=phone_idxs,
                    num_states=orth_handler.allo_num_states,
                    context_length=orth_handler.allo_context_len)
                map_idx_to_allo[allo_idx].add(allo)
        num_labels = max(map_idx_to_allo.keys()) + 1
        all_label_idx_are_used = False
    else:
        raise Exception("invalid state tying type %r" % args.state_tying_type)
    print("Num labels: %i" % num_labels, file=log.v1)

    if dataset:
        count = 0
        for segment_name, targets in iter_dataset_targets(dataset):
            count += 1
            # Determine the silence label on the first sequence if not given
            # explicitly; the most common target is assumed to be silence.
            if silence_label is None or count == 1:
                likely_silence_label = collections.Counter(
                    targets).most_common(1)[0][0]
                if silence_label is None:
                    silence_label = likely_silence_label
                if silence_label != likely_silence_label:
                    print("warning: silence %i but likely %i" %
                          (silence_label, likely_silence_label),
                          file=log.v2)
                print("Silence label: %i" % silence_label, file=log.v1)
                orth_handler.si_label = silence_label
                # Monophone state tying:
                for allo in orth_handler.all_allophone_variations(
                        orth_handler.si_phone):
                    map_idx_to_allo[silence_label].add(allo)
                    map_allo_to_idx[allo] = silence_label
            assert segment_name in corpus
            orth = corpus[segment_name]
            allo_states = orth_handler.orth_to_allophone_states(orth=orth)
            if args.print_seq:
                print("%r %r" % (segment_name, orth))
            if args.print_allos:
                print("  allophone state seq: %r" % allo_states)
            # Collapse repeated targets and drop silence so the target seq
            # lines up 1:1 with the allophone state seq.
            tgt_seq = [t for t in uniq(targets) if t != silence_label]
            if args.print_targets:
                print("  target seq: %r" % (tgt_seq, ))
            assert len(allo_states) == len(tgt_seq), "check --hmm_states or so"
            for allo, t in zip(allo_states, tgt_seq):
                allo.boundary = 0  # do not differ between boundaries
                allos = map_idx_to_allo[t]
                if allo in map_allo_to_idx:
                    assert allo in allos, "bad mapping"
                else:
                    assert allo not in allos
                    allos.add(allo)
                    map_allo_to_idx[allo] = t
            # Stop early once every label index has been observed.
            if len(map_idx_to_allo) >= num_labels:
                assert len(map_idx_to_allo) == num_labels
                assert 0 in map_idx_to_allo
                assert num_labels - 1 in map_idx_to_allo
                print("Finished with uniq mapping after %i sequences." % count,
                      file=log.v1)
                break
            if count % 100 == 0:
                print("Have indices: %i (num labels: %i)" %
                      (len(map_idx_to_allo), num_labels),
                      file=log.v1)

        print("Finished. Have indices: %i (num labels: %i)" %
              (len(map_idx_to_allo), num_labels),
              file=log.v1)
        # Report which phonemes were never mapped, for diagnostics.
        if len(map_idx_to_allo) < num_labels:
            found = []
            not_found = []
            for p in sorted(lexicon.phonemes.keys()):
                allo = AllophoneState(p, state=0)
                if allo in map_allo_to_idx:
                    found.append(p)
                else:
                    not_found.append(p)
            print("Phonemes found: %r" % found)
            print("Phonemes not found: %r" % not_found)

    if args.state_tying_output:
        assert not os.path.exists(args.state_tying_output)
        if all_label_idx_are_used:
            assert len(map_idx_to_allo) == num_labels
            assert 0 in map_idx_to_allo
            assert num_labels - 1 in map_idx_to_allo
        # One "<allophone> <index>" line per allophone.
        f = open(args.state_tying_output, "w")
        for i, allos in sorted(map_idx_to_allo.items()):
            for allo in allos:
                f.write("%s %i\n" % (allo.format(), i))
        f.close()
        print("Wrote state tying to %r." % args.state_tying_output,
              file=log.v1)

    print("The end.")
def main():
    """
    Build a monophone allophone-state -> label-index state tying from a
    dataset, a Bliss corpus and a lexicon, and optionally write it to a file.

    Like the 'monophone' branch of the other state-tying main(), but without
    a --state_tying_type option: monophone tying is always assumed.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--action")
    arg_parser.add_argument("--print_seq", action='store_true')
    arg_parser.add_argument("--print_allos", action='store_true')
    arg_parser.add_argument("--print_targets", action='store_true')
    arg_parser.add_argument("--dataset")
    arg_parser.add_argument("--corpus")
    arg_parser.add_argument("--lexicon")
    arg_parser.add_argument("--silence", type=int)
    arg_parser.add_argument("--context", default=1, type=int)
    arg_parser.add_argument("--hmm_states", default=3, type=int)
    arg_parser.add_argument("--state_tying_output")
    arg_parser.add_argument("--allo_add_all", action="store_true")
    args = arg_parser.parse_args()

    # Each input is optional; the corresponding variable stays None when the
    # argument was not given.
    dataset = init_dataset_via_str(
        config_str=args.dataset) if args.dataset else None
    corpus = dict(iter_bliss_orth(
        filename=args.corpus)) if args.corpus else None
    lexicon = Lexicon(filename=args.lexicon) if args.lexicon else None
    silence_label = args.silence

    if args.action == "show_corpus":
        pprint(corpus)
        return

    print("Num phones: %i" % len(lexicon.phonemes), file=log.v1)
    print("Phones: %r" % sorted(lexicon.phonemes.keys()), file=log.v1)

    orth_handler = OrthHandler(lexicon=lexicon,
                               allo_context_len=args.context,
                               allo_num_states=args.hmm_states)
    map_idx_to_allo = defaultdict(set)  # type: dict[int, set[AllophoneState]]
    map_allo_to_idx = {}  # type: dict[AllophoneState, int]
    if args.allo_add_all:
        orth_handler.allo_add_all = True

    # NOTE: Assume monophone state tying for now!
    num_labels = orth_handler.expected_num_labels_for_monophone_state_tying()
    print("Num labels: %i" % num_labels, file=log.v1)

    count = 0
    for segment_name, targets in iter_dataset_targets(dataset):
        count += 1
        # Determine the silence label on the first sequence if not given
        # explicitly; the most common target is assumed to be silence.
        if silence_label is None or count == 1:
            likely_silence_label = collections.Counter(targets).most_common(
                1)[0][0]
            if silence_label is None:
                silence_label = likely_silence_label
            if silence_label != likely_silence_label:
                print("warning: silence %i but likely %i" %
                      (silence_label, likely_silence_label),
                      file=log.v2)
            print("Silence label: %i" % silence_label, file=log.v1)
            orth_handler.si_label = silence_label
            # Monophone state tying:
            for allo in orth_handler.all_allophone_variations(
                    orth_handler.si_phone):
                map_idx_to_allo[silence_label].add(allo)
                map_allo_to_idx[allo] = silence_label
        assert segment_name in corpus
        orth = corpus[segment_name]
        allo_states = orth_handler.orth_to_allophone_states(orth=orth)
        if args.print_seq:
            print("%r %r" % (segment_name, orth))
        if args.print_allos:
            print("  allophone state seq: %r" % allo_states)
        # Collapse repeated targets and drop silence so the target seq
        # lines up 1:1 with the allophone state seq.
        tgt_seq = [t for t in uniq(targets) if t != silence_label]
        if args.print_targets:
            print("  target seq: %r" % (tgt_seq, ))
        assert len(allo_states) == len(tgt_seq), "check --hmm_states or so"
        for allo, t in zip(allo_states, tgt_seq):
            allo.boundary = 0  # do not differ between boundaries
            allos = map_idx_to_allo[t]
            if allo in map_allo_to_idx:
                assert allo in allos, "bad mapping"
            else:
                assert allo not in allos
                allos.add(allo)
                map_allo_to_idx[allo] = t
        # Stop early once every label index has been observed.
        if len(map_idx_to_allo) >= num_labels:
            assert len(map_idx_to_allo) == num_labels
            assert 0 in map_idx_to_allo
            assert num_labels - 1 in map_idx_to_allo
            print("Finished with uniq mapping after %i sequences." % count,
                  file=log.v1)
            break
        if count % 100 == 0:
            print("Have indices: %i (num labels: %i)" %
                  (len(map_idx_to_allo), num_labels),
                  file=log.v1)

    print("Finished. Have indices: %i (num labels: %i)" %
          (len(map_idx_to_allo), num_labels),
          file=log.v1)
    # Report which phonemes were never mapped, for diagnostics.
    if len(map_idx_to_allo) < num_labels:
        found = []
        not_found = []
        for p in sorted(lexicon.phonemes.keys()):
            allo = AllophoneState(p, state=0)
            if allo in map_allo_to_idx:
                found.append(p)
            else:
                not_found.append(p)
        print("Phonemes found: %r" % found)
        print("Phonemes not found: %r" % not_found)

    if args.state_tying_output:
        assert not os.path.exists(args.state_tying_output)
        assert len(map_idx_to_allo) == num_labels
        assert 0 in map_idx_to_allo
        assert num_labels - 1 in map_idx_to_allo
        f = open(args.state_tying_output, "w")
        # Each label index must correspond to exactly one (phone, state)
        # pair; write every allophone variation of it with that index.
        for i in range(num_labels):
            phons = sorted(
                set([(allo.id, allo.state) for allo in map_idx_to_allo[i]]))
            assert len(phons) == 1
            phon, state = phons[0]
            for allo in orth_handler.all_allophone_variations(phon,
                                                              states=[state]):
                f.write("%s %i\n" % (allo, i))
        f.close()
        print("Wrote state tying to %r." % args.state_tying_output,
              file=log.v1)