示例#1
0
    def read_walign(self, filename):
        """
        Read the word alignment of a file in the standard format of Julius
        CSR engine.

        The "wseq1:" line provides the tokens and the "cmscore1:" line
        provides one confidence score per token. Every line starting with
        "[" that is read after a "begin forced alignment" marker (and before
        the matching "end forced alignment" marker) yields one entry; its
        first two fields are frame indexes, divided by 100.0 to get the
        time values.

        @param filename (str - IN) The input file name.
        @return A tuple of two elements:
            - None (this reader produces no phoneme alignment)
            - a list of [start-time, end-time, word, score] lists

        """
        with codecs.open(filename, "r", encoding) as stream:
            content = stream.readlines()

        words = [""]
        confidences = [0]
        aligned = []
        # Index of the next word to align; -1 while outside an alignment section.
        position = -1

        for raw in content:
            text = ToStrip(raw)

            if text.startswith("=== begin forced alignment ==="):
                position = 0
                continue

            if text.startswith("=== end forced alignment ==="):
                position = -1
                continue

            if text.startswith("wseq1:"):
                # The tokens; fall back on a single empty token.
                words = text[6:].split() or [""]
                continue

            if text.startswith("cmscore1:"):
                # Confidence score of the pronunciation of each token;
                # fall back on a single null score.
                confidences = [float(value) for value in text[9:].split()] or [0]
                continue

            if position > -1 and text.startswith("["):
                # An aligned word. Once the brackets are removed, the fields are:
                #   0: first frame
                #   1: last frame
                #   2: score of the segmentation (log proba)
                #   3: word
                fields = ToStrip(text.replace("[", "").replace("]", "")).split(" ")
                start = float(fields[0]) / 100.0
                end = float(fields[1]) / 100.0
                # Label and score come from the wseq1/cmscore1 lines,
                # not from the alignment line itself.
                aligned.append([start, end, words[position], confidences[position]])
                position += 1

        # Adjust time values: an entry ending before the beginning of the
        # next one is extended up to that beginning. The last entry is
        # compared with 0.0 instead.
        last = len(aligned) - 1
        for index, entry in enumerate(aligned):
            following_start = aligned[index + 1][0] if index < last else 0.0
            if entry[1] < following_start:
                entry[1] = following_start

        return (None, aligned)
示例#2
0
    def read_palign(self, filename):
        """
        Read the phoneme alignment of a file in the standard format of
        Julius CSR engine, and build the word alignment from it.

        The "phseq1:" line provides the phonemes, grouped by word with "|".
        The "sentence1:" line provides the tokens and the "cmscore1:" line
        provides one confidence score per token. Every line starting with
        "[" that is read after a "begin forced alignment" marker (and before
        the matching "end forced alignment" marker) yields one phoneme
        entry; its first two fields are frame indexes, divided by 100.0 to
        get the time values.

        @param filename (str - IN) The input file name.
        @return A tuple of two lists:
            - [start-time, end-time, phoneme, score] lists
            - [start-time, end-time, word, score] lists

        """
        with codecs.open(filename, "r", encoding) as stream:
            content = stream.readlines()

        phones = []        # phoneme alignment
        words = []         # word alignment
        phonemes = []      # phonemes of the phseq1 line, word marks removed
        word_chunks = []   # phseq1 line split into words
        confidences = [0]
        labels = [""]
        # Index of the next phoneme to align; -1 while outside an alignment section.
        position = -1

        for raw in content:
            text = ToStrip(raw)

            if text.startswith("=== begin forced alignment ==="):
                position = 0
                continue

            if text.startswith("=== end forced alignment ==="):
                position = -1
                continue

            if text.startswith("phseq1:"):
                text = ToStrip(text[7:])
                word_chunks = text.split("|")
                # Index of the last phoneme of each word.
                # NOTE: word_ends is only bound once a phseq1 line was read.
                word_ends = []
                count = 0
                for chunk in word_chunks:
                    count += len(chunk.split())
                    word_ends.append(count - 1)
                # The list of phonemes (without word segmentation).
                phonemes = ToStrip(text.replace("|", "")).split()
                continue

            if text.startswith("cmscore1:"):
                # Confidence score of the pronunciation of each token;
                # fall back on a single null score.
                confidences = [float(value) for value in text[9:].split()] or [0]
                continue

            if text.startswith("sentence1:"):
                # The tokens; fall back on a single empty token.
                labels = text[10:].split() or [""]
                continue

            if position > -1 and text.startswith("["):
                # An aligned phoneme. Once the brackets are removed, the fields are:
                #   0: first frame
                #   1: last frame
                #   2: score of the segmentation (log proba)
                #   3: triphone used
                fields = ToStrip(text.replace("[", "").replace("]", "")).split(" ")
                start = float(fields[0]) / 100.0
                end = float(fields[1]) / 100.0
                # Put the real phoneme instead of the triphone; use an empty
                # label when the line has no fourth field.
                phoneme = phonemes[position] if len(fields) > 3 else ""
                phones.append([start, end, phoneme, fields[2]])
                position += 1

        # Adjust time values and create the word alignment.
        word_pos = 0       # index of the current word
        word_start = 0.0   # start time of the current word
        end = 0.0          # end time of the latest phoneme processed
        last = len(phones) - 1
        for index, phone in enumerate(phones):
            # A phoneme ending before the beginning of the next one is
            # extended up to that beginning. The last phoneme is compared
            # with 0.0 instead.
            following_start = phones[index + 1][0] if index < last else 0.0
            end = following_start if phone[1] < following_start else phone[1]
            phone[1] = end

            # Override the segmentation score of the phoneme by
            # the score of the pronunciation of the word.
            phone[3] = confidences[word_pos]

            # The last phoneme of a word closes this word.
            if index == word_ends[word_pos]:
                words.append([word_start, end, labels[word_pos], confidences[word_pos]])
                word_pos += 1
                word_start = end

        # Last word, or the only entry in case of empty interval...
        if word_pos == len(word_chunks) - 1:
            words.append([word_start, end, labels[word_pos - 1], confidences[word_pos - 1]])

        return (phones, words)