Example #1
    def run(self):
        with uopen(self.cluster_names, "rt") as f:
            cluster_names = [l.strip() for l in f]

        clusters = collections.defaultdict(set)
        with uopen(self.cluster_map, "rt") as f:
            t = ET.parse(f)
            for mi in t.findall(".//map-item"):
                k = mi.attrib["key"]
                v = int(mi.attrib["value"].split(".")[-1]) - 1
                clusters[cluster_names[v]].add(k)

        c = corpus.Corpus()
        c.load(tk.uncached_path(self.corpus_file))

        original_segments = {}
        for s in c.segments():
            original_segments[s.fullname()] = s

        audio = {}
        transcriptions = {}
        for cluster_name in clusters:
            clusters[cluster_name] = list(sorted(clusters[cluster_name]))
            transcriptions[cluster_name] = " ".join(
                original_segments[s].orth for s in clusters[cluster_name])
            audio[cluster_name] = [
                (original_segments[n].recording.audio,
                 original_segments[n].start,
                 original_segments[n].end)
                for n in clusters[cluster_name]
            ]

        new_c = corpus.Corpus()
        new_c.name = c.name
        for cluster_name, audio_files in audio.items():
            out_path = os.path.join(self.audio_output.get_path(),
                                    cluster_name + ".wav")
            if os.path.exists(out_path):
                os.unlink(out_path)
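            # build an ffmpeg concat demuxer list file: one file/inpoint(/outpoint) entry per source segment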
            with open(f"{cluster_name}.txt", "wt") as f:
                for af in audio_files:
                    f.write(f"file {af[0]}\ninpoint {af[1]}\n")
                    if not math.isinf(af[2]):
                        f.write(f"outpoint {af[2]}\n")
            self.sh(
                f"ffmpeg -loglevel fatal -hide_banner -f concat -safe 0 -i '{cluster_name}.txt' '{out_path}'"
            )

            r = corpus.Recording()
            r.name = cluster_name
            r.audio = out_path
            s = corpus.Segment()
            s.name = "1"
            s.start = 0.0
            s.end = float("inf")
            s.orth = transcriptions[cluster_name]
            r.add_segment(s)

            new_c.add_recording(r)

        new_c.dump(self.output_corpus.get_path())
Example #2
    def run(self):
        with uopen(self.bliss_lexicon, "rt") as f:
            tree = ET.parse(f)
        with uopen(self.out_g2p_lexicon, "wt") as out:
            all_lemmas = tree.findall(".//lemma")
            assert (
                len(all_lemmas) > 0
            ), "No lemma tag found in the lexicon file! Wrong format file?"

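            # lemmas marked as "special" (e.g. silence/unknown) are skipped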
            for lemma in all_lemmas:
                if lemma.get("special") is not None:
                    continue

                orth = lemma.find("orth").text.strip()

                if self.include_pronunciation_variants:
                    phons = lemma.findall("phon")
                    phon_single = []
                    for phon in phons:
                        p = phon.text.strip()
                        if p not in phon_single:
                            phon_single.append(p)
                            out.write("%s %s\n" % (orth, p))
                else:
                    phon = lemma.find("phon").text.strip()
                    out.write("%s %s\n" % (orth, phon))
Example #3
 def run(self):
     with util.uopen(self.minuend, "rt") as fin:
         file_set1 = set(fin.read().split("\n"))
     with util.uopen(self.subtrahend, "rt") as fin:
         file_set2 = set(fin.read().split("\n"))
     with util.uopen(self.out_file, "wt") as fout:
         fout.write("\n".join(sorted(file_set1.difference(file_set2))))
Example #4
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        dictionary = {}

        segments = None
        if self.segment_file:
            with uopen(self.segment_file) as f:
                segments = set(line.decode().strip() for line in f)

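        # keep only whitelisted segments, or drop exactly those when invert_match is set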
        for segment in c.segments():
            orth = segment.orth.strip()
            key = segment.fullname()
            if segments:
                if (not self.invert_match and key not in segments
                        and segment.name not in segments):
                    continue
                if self.invert_match and key in segments:
                    continue
            dictionary[key] = orth

        dictionary_string = pprint.pformat(dictionary, width=1000)
        with uopen(self.out_dictionary, "wt") as f:
            f.write(dictionary_string)
Example #5
 def run(self):
     with uopen(self.raw_dict_file) as read_f, uopen(self.out_dict,
                                                     "w") as out_f:
         for line in read_f.readlines()[1:]:
             if line.startswith("#"):  # skip comment
                 continue
             parts = line.strip().split(" ", 1)
             if len(parts) < 2:
                 continue
             token = parts[0].replace("&amp;", "&")  # e.g A&amp;E -> A&E
             mapped_token = _map_token(token)  # preprocessing as corpus
             out_f.write(mapped_token + " " + parts[1] + "\n")
Example #6
    def run(self):
        for idx in range(self.num_ramp_ups + 1):
            if os.path.exists(self.out_g2p_models[idx].get_path()):
                continue

            args = [
                str(self.g2p_python),
                str(self.g2p_path),
                "-e",
                "utf-8",
                "-i",
                str(self.min_iter),
                "-I",
                str(self.max_iter),
                "-d",
                self.devel,
                "-s",
                self.size_constrains,
                "-n",
                "tmp-model",
                "-S",
                "-t",
                self.g2p_lexicon.get_path(),
            ]
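            # after the first iteration, ramp up (-r) from the previously trained model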
            if idx > 0:
                args += ["-r", "-m", self.out_g2p_models[idx - 1].get_path()]
            args += self.extra_args

            if os.path.exists("tmp-model"):
                os.unlink("tmp-model")

            with uopen("stdout.%d" % idx, "w") as out:
                sp.check_call(args, stdout=out)

            with uopen("stdout.%d" % idx, "rt") as log:
                for line in log:
                    if "total symbol errors" in line:
                        error_rate = float(line.split("(")[1].split("%")[0])
                        self.out_error_rates[idx].set(error_rate)

            os.rename("tmp-model", self.out_g2p_models[idx].get_path())

        best = min(
            ((idx, err_var.get())
             for idx, err_var in enumerate(self.out_error_rates)),
            key=lambda t: t[1],
        )
        os.symlink("model-%d" % best[0], self.out_best_model.get_path())
        self.out_best_error_rate.set(best[1])
Example #7
    def run(self):
        c = corpus.Corpus()
        c.name = self.name

        with uopen(self.metadata, "rt") as metadata_file:
            for line in metadata_file:
                name, text, processed_text = line.split("|")
                audio_file_path = os.path.join(
                    self.audio_folder.get_path(), name + ".wav"
                )
                assert os.path.isfile(
                    audio_file_path
                ), "Audio file %s was not found in provided audio path %s" % (
                    audio_file_path,
                    self.audio_folder.get_path(),
                )

                recording = corpus.Recording()
                recording.name = name
                recording.audio = audio_file_path
                segment = corpus.Segment()
                segment.orth = processed_text.strip()
                segment.name = name

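                # derive the segment duration from the WAV header (number of frames / sample rate)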
                wave_info = wave.open(audio_file_path)
                segment.start = 0
                segment.end = wave_info.getnframes() / wave_info.getframerate()
                wave_info.close()

                recording.add_segment(segment)
                c.add_recording(recording)

        c.dump(self.out_bliss_corpus.get_path())
Example #8
    def run(self):
        train_binary = os.path.join(
            tk.uncached_path(self.subword_nmt_repo), "subword_nmt/learn_bpe.py"
        )
        args = [
            sys.executable,
            train_binary,
            "-o",
            self.out_code_file.get_path(),
            "-s",
            str(self.symbols),
            "--min-frequency",
            str(self.min_frequency),
        ]
        if self.dict_input:
            args += ["--dict-input"]
        if self.total_symbols:
            args += ["--total-symbols"]

        text_corpus = self.text_corpus.get_path()

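        # stream the (possibly compressed) training text into learn_bpe's stdin in 4 KiB chunks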
        with util.uopen(text_corpus, "rb") as f:
            p = sp.Popen(args, stdin=sp.PIPE, stdout=sys.stdout, stderr=sys.stderr)
            while True:
                data = f.read(4096)
                if len(data) > 0:
                    p.stdin.write(data)
                else:
                    break

            p.communicate()
            assert p.returncode == 0
Example #9
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

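        # optionally restrict the output to the segments listed in segment_file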
        if self.segment_file:
            with uopen(self.segment_file, "rt") as f:
                segments_whitelist = set(l.strip() for l in f.readlines()
                                         if len(l.strip()) > 0)
        else:
            segments_whitelist = None

        with uopen(self.out_txt.get_path(), "wt") as f:
            for segment in c.segments():
                if (not segments_whitelist) or (segment.fullname()
                                                in segments_whitelist):
                    f.write(segment.orth + "\n")
Example #10
    def run(self):
        lex = lexicon.Lexicon()

        phonemes = set()
        seen_lemma = {}
        with uopen(self.text_file.get_path()) as f:
            for line in f:
                # splitting is taken from RASR
                # src/Tools/Bliss/blissLexiconLib.py#L185
                s = line.split(None, 1)
                orth = s[0].split("\\", 1)[0]
                phon_variants = [
                    tuple(p.split()) for p in s[1].split("\\") if p.strip()
                ]
                for phon_variant in phon_variants:
                    phonemes.update(phon_variant)
                phon = [" ".join(v) for v in phon_variants]
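                # merge pronunciation variants when the same orthography occurs again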
                if orth in seen_lemma:
                    lemma = seen_lemma[orth]
                    for p in phon:
                        if p not in lemma.phon:
                            lemma.phon.append(p)
                else:
                    lemma = lexicon.Lemma(orth=[orth], phon=phon)
                    seen_lemma[orth] = lemma
                    lex.add_lemma(lemma)

        for phoneme in sorted(phonemes):
            lex.add_phoneme(phoneme)

        write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
Example #11
 def _read_ngrams(self, n):
     """
     Read the ngrams knowing start and end lines
     """
     with util.uopen(self.lm_path, "rt", encoding="utf-8") as infile:
         infile.seek(self.ngrams_start[n - 1][1])
         i = self.ngrams_start[n - 1][0] - 1
         while i < self.ngrams_end[n - 1]:
             i += 1
             text = infile.readline()
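             # ARPA entry: log10 probability, the n words of the n-gram, optional back-off weight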
             entry = text.split()
             prob = float(entry[0])
             if len(entry) > n + 1:
                 back = float(entry[-1])
                 words = entry[1 : n + 1]
             else:
                 back = 0.0
                 words = entry[1:]
             ngram = " ".join(words)
             if (n == 1) and words[0] == "<s>":
                 self.sentprob = prob
                 prob = 0.0
             if (i - (self.ngrams_start[n - 1][0] - 1)) % 1000 == 0:
                 logging.info(f"Read 1000 {n}grams")
             yield ngram, (prob, back)
Example #12
    def run(self):
        segment_dict = {}
        for task_id, log_file in self.alignment_logs.items():
            logging.info("Reading: {}".format(log_file))
            file_path = tk.uncached_path(log_file)
            document = ET.parse(uopen(file_path))
            _seg_list = document.findall(".//segment")
            for seg in _seg_list:
                avg = seg.find(".//score/avg")
                segment_dict[seg.attrib["full-name"]] = float(avg.text)
            del document

        logging.info("Scores have {} entries.".format(len(segment_dict)))
        score_np = np.asarray(list(segment_dict.values()))
        logging.info("Max {}; Min {}; Median {}".format(
            score_np.max(), score_np.min(), np.median(score_np)))
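        # threshold at the requested percentile of the score distribution, optionally capped by absolute_threshold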
        avg_score_threshold = np.percentile(score_np, self.percentile)
        if np.isnan(avg_score_threshold):
            avg_score_threshold = np.inf
        logging.info("Avg Threshold is {} with percentile {}".format(
            avg_score_threshold, self.percentile))
        if self.absolute_threshold is not None:
            avg_score_threshold = min(avg_score_threshold,
                                      self.absolute_threshold)
        logging.info("Threshold is {}".format(avg_score_threshold))

        if self.plot:
            import matplotlib

            matplotlib.use("Agg")
            import matplotlib.pyplot as plt

            plot_percentile = np.percentile(score_np,
                                            90)  # there can be huge outliers
            np.clip(score_np, 0, 200, out=score_np)
            plt.hist(score_np, bins=100, range=(0, 200))
            plt.xlabel("Average Maximum-Likelihood Score")
            plt.ylabel("Number of Segments")
            plt.title("Histogram of Alignment Scores")
            plt.savefig(fname=self.out_plot_avg.get_path())

        # Only keep segments whose score is at or below the threshold
        filtered_segments = [
            seg for seg, avg in segment_dict.items()
            if avg <= avg_score_threshold
        ]
        logging.info("Have {} entries after filtering.".format(
            len(filtered_segments)))

        for idx, segments in enumerate(
                chunks(filtered_segments, self.num_segments)):
            with open(self.out_single_segment_files[idx + 1].get_path(),
                      "wt") as segment_file:
                for segment in segments:
                    segment_file.write(segment + "\n")

        with open(self.out_single_file.get_path(), "wt") as segment_file:
            for segment in filtered_segments:
                segment_file.write(segment + "\n")
Example #13
    def run(self):
        lex = lexicon.Lexicon()
        lex.load(self.bliss_lexicon.get_path())

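        # assign consecutive integer indices to the phoneme symbols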
        vocab = {k: v for v, k in enumerate(lex.phonemes.keys())}
        with uopen(self.out_vocab, "wb") as f:
            pickle.dump(vocab, f)

        print("Vocab Size: %i" % len(lex.phonemes))
        self.out_vocab_size.set(len(lex.phonemes))
Example #14
    def run(self):
        bpe_codes_cmd = [
            sys.executable,
            os.path.join(tk.uncached_path(self.subword_nmt_repo), "learn_bpe.py"),
            "--output",
            self.out_bpe_codes.get_path(),
            "--symbols",
            str(self.bpe_size),
        ]

        util.create_executable("create_bpe_codes.sh", bpe_codes_cmd)

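        # pipe the (possibly compressed) training text into the BPE learning script via stdin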
        with util.uopen(self.text_file, "rb") as f:
            p = sp.Popen(
                bpe_codes_cmd, stdin=sp.PIPE, stdout=sys.stdout, stderr=sys.stderr
            )
            while True:
                data = f.read(4096)
                if len(data) > 0:
                    p.stdin.write(data)
                else:
                    break
            p.communicate()
            assert p.returncode == 0

        bpe_vocab_cmd = [
            sys.executable,
            os.path.join(tk.uncached_path(self.subword_nmt_repo), "create-py-vocab.py"),
            "--txt",
            self.text_file.get_path(),
            "--bpe",
            self.out_bpe_codes.get_path(),
            "--unk",
            self.unk_label,
            "--out",
            self.out_bpe_vocab.get_path(),
        ]

        util.create_executable("create_bpe_vocab.sh", bpe_vocab_cmd)
        sp.run(bpe_vocab_cmd, check=True)

        with util.uopen(self.out_bpe_vocab) as f:
            num_labels = max(list(eval(f.read()).values())) + 1  # 0-based index
            self.out_vocab_size.set(num_labels)
Example #15
 def run(self):
     d = eval(util.uopen(self.search_py_output, "r").read())
     assert isinstance(d, dict)  # seq_tag -> bpe string
     assert not os.path.exists(self.out_word_search_results.get_path())
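     # rewrite the search output with the BPE separators ("@@ ") removed, i.e. as word sequences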
     with util.uopen(self.out_word_search_results, "w") as out:
         out.write("{\n")
         for seq_tag, entry in sorted(d.items()):
             if "#" in seq_tag:
                 tag_split = seq_tag.split("/")
                 recording_name, segment_name = tag_split[2].split("#")
                 seq_tag = tag_split[0] + "/" + recording_name + "/" + segment_name
             if isinstance(entry, list):
                 # n-best list as [(score, text), ...]
                 out.write("%r: [\n" % (seq_tag))
                 for score, text in entry:
                     out.write("(%f, %r),\n" % (score, text.replace("@@ ", "")))
                 out.write("],\n")
             else:
                 out.write("%r: %r,\n" % (seq_tag, entry.replace("@@ ", "")))
         out.write("}\n")
Example #16
    def filter(self):
        handle, tmp_path = mkstemp(dir=".", text=True)
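        # keep only complete entries (4 tab-separated fields) and replace the original lexicon with the filtered copy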
        with uopen(self.out_g2p_lexicon,
                   "rt") as lex, os.fdopen(handle, "wt") as fd_out:
            for line in lex:
                if len(line.strip().split("\t")) == 4:
                    fd_out.write(line)

        os.remove(self.out_g2p_lexicon)
        os.rename(tmp_path, self.out_g2p_lexicon)
Example #17
 def run(self):
     corpus = Corpus()
     corpus.load(self.bliss_corpus.get_path())
     d = eval(util.uopen(self.recog_words_file.get_path(), "r").read())
     assert isinstance(
         d, dict
     ), "only search output file with dict format is supported"
     with util.uopen(self.out_ctm_file.get_path(), "w") as out:
         out.write(
             ";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n"
         )
         for seg in corpus.segments():
             seg_start = 0.0 if seg.start == float("inf") else seg.start
             seg_end = 0.0 if seg.end == float("inf") else seg.end
             seg_fullname = seg.fullname()
             assert seg_fullname in d, "can not find {} in search output".format(
                 seg_fullname
             )
             out.write(";; %s (%f-%f)\n" % (seg_fullname, seg_start, seg_end))
             assert isinstance(
                 d[seg_fullname], str
             ), "no support for n-best lists yet"
             words = d[seg_fullname].split()
             # Just linearly interpolate the start/end of each word as time stamps are not given
             avg_dur = (seg_end - seg_start) * 0.9 / max(len(words), 1)
             for i in range(len(words)):
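                 # optionally skip non-word tags such as "[noise]"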
                 if (
                     self.filter_tags
                     and words[i].startswith("[")
                     and words[i].endswith("]")
                 ):
                     continue
                 out.write(
                     "%s 1 %f %f %s 0.99\n"
                     % (
                         seg.recording.name,
                         seg_start + avg_dur * i,
                         avg_dur,
                         words[i],
                     )
                 )
Example #18
    def run(self):
        speaker_id, gender, rec_name = None, None, None
        with uopen(self.speakers_stats_file) as read_f, uopen(
                self.out_speakers_list, "w") as out_f:
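            # the stats file mixes speaker lines ("<id> <gender> <recording> ...") with bare recording-name lines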
            for line in read_f:
                l = line.strip().split()
                if len(l) < 2:
                    continue
                if l[1] == "F" or l[1] == "M":  # start new speaker
                    speaker_id = l[0]
                    gender = l[1]
                    rec_name = l[2]
                elif l[0].endswith("A") or l[0].endswith(
                        "B"):  # recording name
                    rec_name = l[0]
                else:
                    continue

                if speaker_id:
                    out_f.write(speaker_id + " " + gender + " " + rec_name +
                                "\n")  # speaker_id gender recording
Example #19
 def run(self):
     with uopen(self.out_g2p_lexicon, "wt") as out:
         with uopen(self.out_g2p_untranslated, "wt") as err:
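             # apply the trained g2p model to the word list; stdout becomes the lexicon, stderr collects untranslated words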
             sp.check_call(
                 [
                     str(self.g2p_python),
                     str(self.g2p_path),
                     "-e",
                     "utf-8",
                     "-V",
                     str(self.variants_mass),
                     "--variants-number",
                     str(self.variants_number),
                     "-m",
                     self.g2p_model.get_path(),
                     "-a",
                     self.word_list.get_path(),
                 ],
                 stdout=out,
                 stderr=err,
             )
Example #20
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        if self.segment_file:
            with uopen(self.segment_file.get_path(), "rt") as f:
                segments_whitelist = set(l.strip() for l in f.readlines()
                                         if len(l.strip()) > 0)
            segment_iterator = filter(
                lambda s: s.fullname() in segments_whitelist, c.segments())
        else:
            segment_iterator = c.segments()

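        # replace each segment's orthography with the corresponding text file line (order must match)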
        with uopen(self.text_file, "rt") as f:
            for segment, line in itertools.zip_longest(segment_iterator, f):
                assert (segment is not None
                        ), "there were more text file lines than segments"
                assert line is not None, "there were fewer text file lines than segments"
                assert len(line) > 0
                segment.orth = line.strip()

        c.dump(self.out_corpus.get_path())
Example #21
    def run(self):
        transform = (lambda s: s.lower()) if not self.case_sensitive else (lambda s: s)

        with uopen(tk.uncached_path(self.bliss_lexicon), "r") as lexicon_file:
            old_lexicon = ET.fromstring(lexicon_file.read())

        with uopen(tk.uncached_path(self.word_list), "r") as words_file:
            words = set([transform(w.strip()) for w in words_file.readlines()])

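        # keep the phoneme inventory plus every lemma whose orthography is in the word list; "[...]" lemmas are always kept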
        root = ET.Element("lexicon")
        root.append(old_lexicon.find("phoneme-inventory"))
        for lemma in old_lexicon.findall("lemma"):
            if any(
                orth.text is not None
                and (transform(orth.text) in words or orth.text.startswith("["))
                for orth in lemma.findall("orth")
            ):
                root.append(lemma)

        with uopen(self.out_bliss_lexicon.get_path(), "wt") as lexicon_file:
            lexicon_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
            lexicon_file.write(ET.tostring(root, "unicode"))
Example #22
    def run(self):
        tag_map = {}

        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        all_tags = [("d%d" % i, "default%d" % i,
                     "all other segments of category %d" % i)
                    for i in range(len(self.tag_mapping) + 1)]

        for segment in c.segments():
            tag_map[segment.fullname()] = [
                "d%d" % i for i in range(len(self.tag_mapping) + 1)
            ]

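        # overwrite the default tag for segments listed in the corresponding mapping files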
        for i, (tag, segments) in enumerate(self.tag_mapping):
            all_tags.append(tag)
            for file in segments.values():
                for segment in uopen(file):
                    if segment.rstrip() in tag_map:
                        tag_map[segment.rstrip()][i] = tag[0]

        with uopen(self.out_stm_path, "wt") as out:
            for segment in c.segments():
                speaker_name = (segment.speaker().name if segment.speaker()
                                is not None else segment.recording.name)
                segment_track = segment.track + 1 if segment.track else 1
                out.write("%s %d %s %5.2f %5.2f <%s> %s\n" % (
                    segment.recording.name,
                    segment_track,
                    speaker_name,
                    segment.start,
                    segment.end,
                    ",".join(tag_map[segment.fullname()]),
                    segment.orth,
                ))
            for tag in all_tags:
                out.write(';; LABEL "%s" "%s" "%s"\n' % tag)
Example #23
    def run(self):
        with uopen(self.g2p_lexicon, "rt", encoding="utf-8") as f:
            oov_words = dict()
            for orth, data in it.groupby(
                    map(lambda line: line.strip().split("\t"), f),
                    lambda t: t[0]):
                oov_words[orth] = []
                for d in data:
                    if len(d) == 4:
                        oov_words[orth].append(d[3])
                    elif len(d) < 4:
                        logging.warning(
                            'No pronunciation found for orthography "{}"'.
                            format(orth))
                    else:
                        logging.warning(
                            'Did not fully parse entry for orthography "{}"'.
                            format(orth))

        with uopen(self.iv_bliss_lexicon, "rt") as f:
            iv_lexicon = ET.parse(f)

        if self.merge:
            root = iv_lexicon.getroot()
        else:
            root = ET.Element("lexicon")
            root.append(iv_lexicon.find("phoneme-inventory"))

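        # add one lemma per OOV word with all pronunciations produced by g2p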
        for orth, prons in oov_words.items():
            lemma = ET.SubElement(root, "lemma")
            ET.SubElement(lemma, "orth").text = orth
            for pron in prons:
                ET.SubElement(lemma, "phon").text = pron

        with uopen(self.out_oov_lexicon, "wt", encoding="utf-8") as f:
            tree = ET.ElementTree(root)
            tree.write(f, "unicode", True)
Example #24
    def run(self):
        with uopen(tk.uncached_path(self.bliss_lexicon), "r") as lexicon_file:
            lexicon = ET.fromstring(lexicon_file.read())
            words = set()
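            # collect all orthographies; special "[...]" lemmas are dropped when apply_filter is set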
            for e in lexicon.findall("./lemma/orth"):
                if (
                    e.text is not None
                    and len(e.text) > 0
                    and not (e.text.startswith("[") and self.apply_filter)
                ):
                    words.add(e.text)

        with open(self.out_word_list.get_path(), "w") as word_file:
            for w in sorted(words):
                word_file.write("%s\n" % w)
Example #25
    def run(self):
        c = libcorpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        words = Counter()
        for s in c.segments():
            words.update(s.orth.strip().split())

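        # write "count<TAB>word" lines, sorted by descending count and then alphabetically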
        counts = [(v, k) for k, v in words.items()]
        with uopen(self.out_word_counts, "wt") as f:
            f.write(
                "\n".join(
                    "%d\t%s" % t for t in sorted(counts, key=lambda t: (-t[0], t[1]))
                )
            )
Example #26
    def run(self):
        if self.casing != "none":
            logging.warning(
                "The orthography/lemma casing is being changed. Normally 'casing' should be "
                "'none'; it is 'upper' only for legacy reasons."
            )

        def change_casing(text_str):
            if self.casing == "upper":
                return text_str.upper()
            elif self.casing == "lower":
                return text_str.lower()
            elif self.casing == "none":
                return text_str
            else:
                raise NotImplementedError

        with uopen(self.bliss_lexicon, "rt", encoding="utf-8") as f:
            tree = ET.parse(f)
            iv_words = {
                change_casing(orth.text)
                for orth in tree.findall(".//lemma/orth")
                if orth.text
            }

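        # every corpus word whose (re-cased) form is missing from the lexicon counts as OOV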
        with uopen(self.bliss_corpus, "rt", encoding="utf-8") as f:
            tree = ET.parse(f)
            oov_words = {
                w
                for kw in tree.findall(".//recording/segment/orth")
                for w in kw.text.strip().split()
                if change_casing(w) not in iv_words
            }

        with uopen(self.out_oov_words, "wt") as f:
            for w in sorted(oov_words):
                f.write("%s\n" % w)
Example #27
    def run(self):

        state_tying_dict = {}
        with uopen(self.state_tying_file, 'r') as f:
            for line in f:
                if not line.strip(): continue
                tokens = line.split()
                assert len(tokens) >= 2, line
                state_tying_dict[tokens[0]] = ' '.join(tokens[1:])

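        # map each allophone to a phoneme-level label, optionally marking word begin/end with '#'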
        vocab_dict = {}
        max_id = 0
        for allo, index in state_tying_dict.items():
            phon = allo.split('{')[0]
            if not phon.startswith('[') and not phon.endswith(']'):
                # only eow for single-phon pronunciation, i.e. @i@f
                if self.add_eow and ('@f' in allo):
                    phon += '#'
                elif self.add_sow and ('@i' in allo):
                    phon = '#' + phon
            if phon in vocab_dict:
                assert vocab_dict[phon] == index, (
                    "index conflict for %s: %s vs. %s (%s %s)"
                    % (phon, index, vocab_dict[phon], allo, index))
            else:
                vocab_dict[phon] = index
            if int(index) > max_id:
                max_id = int(index)

        with uopen(self.out_label_file, 'w') as f:
            for v in sorted(vocab_dict.keys()):
                f.write(v + ' ' + vocab_dict[v] + '\n')

        num_classes = len(set(vocab_dict.values()))
        print('number of classes:', num_classes)
        assert num_classes == max_id + 1, "expected number of classes %d" % (max_id + 1)
Example #28
 def _get_speakers(self):
     """
     Extract the speakers from the SPEAKERS.TXT file
     """
     with uopen(self.speaker_metadata, "r") as speakersfile:
         for line in speakersfile:
             if line[0] == ";":
                 continue
             procline = list(map(str.strip, line.split("|")))
             self._speakers[int(procline[0])] = [
                 procline[1],
                 procline[2],
                 float(procline[3]),
                 procline[4],
             ]
Example #29
    def prior_and_plot(self):
        all_scores = []
        all_frames = []
        max_emission = 0
        for l in self.out_log_file.values():
            with util.uopen(l.get_path(), "rb") as f:
                tree = ET.parse(f)
            scores = {}
            activations = tree.find(".//activations")
            all_frames.append(int(activations.find("num_frames").text.strip()))
            for score_elem in activations.findall("score"):
                emission = int(score_elem.attrib["emission"])
                max_emission = max(max_emission, emission)
                score = float(score_elem.text.strip())
                scores[emission] = score
            all_scores.append(scores)

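        # merge the per-log scores, weighting each log by its share of the total frame count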
        total_frames = sum(all_frames)
        merged_scores = [0.0 for _ in range(max_emission + 1)]
        for frames, scores in zip(all_frames, all_scores):
            scale = frames / total_frames
            for k, v in scores.items():
                merged_scores[k] += scale * v

        total_mass = sum(merged_scores)
        merged_scores = [s / total_mass for s in merged_scores]

        with open(self.out_prior.get_path(), "wt") as f:
            f.write(
                '<?xml version="1.0" encoding="UTF-8"?>\n<vector-f32 size="%d">\n'
                % len(merged_scores))
            f.write(" ".join("%.20e" % math.log(s)
                             for s in merged_scores) + "\n")
            f.write("</vector-f32>")

        if self.plot_prior:
            import matplotlib

            matplotlib.use("Agg")
            import matplotlib.pyplot as plt

            xdata = range(len(merged_scores))
            plt.semilogy(xdata, merged_scores)

            plt.xlabel("emission idx")
            plt.ylabel("prior")
            plt.grid(True)
            plt.savefig(self.out_prior_plot.get_path())
Example #30
 def _get_rec_to_segs_map(self):
     """
     Returns recording to list of segments mapping
     """
     rec_to_segs = defaultdict(list)
     for trans_file in glob.glob(
             os.path.join(self.trans_dir.get_path(), "*/*/*-trans.text")):
         with uopen(trans_file, "rt") as f:
             for line in f:
                 seg_info = line.strip().split(" ",
                                               3)  # name start end orth
                 assert len(seg_info) == 4
                 rec_name = (seg_info[0].split("-")[0].replace("sw", "sw0")
                             )  # e.g: sw2001A-ms98-a-0022 -> sw02001A
                 rec_to_segs[rec_name].append(seg_info)
     return rec_to_segs