Example #1
0
    def _check_phones(self):
        """Validate the phone inventory and return its symbols.

        Raises IOError when the inventory is empty, uses a reserved
        symbol ('SIL' or 'SPN'), or contains duplicated phone or IPA
        symbols.
        """
        # TODO check xsampa compatibility and/or compatibility
        # with articulatory features databases of IPA or just basic
        # IPA correctness
        self.log.debug('checking phones')

        phones = self.corpus.phones.keys()
        ipas = self.corpus.phones.values()

        if not phones:
            raise IOError('The phones inventory is empty')

        # 'SIL' and 'SPN' are reserved markers, forbidden as phones
        if u'SIL' in phones:
            raise IOError(
                "'SIL' symbol is reserved for indicating "
                "optional silence, it cannot be used in phones")
        if u'SPN' in phones:
            raise IOError(
                "'SPN' symbol is reserved for indicating "
                "vocal noise, it cannot be used in phones")

        dup_phones = duplicates(phones)
        if dup_phones:
            raise IOError(
                "following phone symbols are used several times "
                "in phones: {}".format(dup_phones))

        dup_ipas = duplicates(ipas)
        if dup_ipas:
            raise IOError(
                "following IPA symbols are used several times "
                "in phones: {}".format(dup_ipas))

        self._check_position_dependent_phones(phones)
        return phones
Example #2
0
    def validate_transcription(self):
        """Checking transcriptions.

        Raises IOError when segments and text do not reference exactly
        the same utterance-ids; details of the mismatch are written to
        the debug log first.
        """
        self.log.debug("checking transcriptions")

        seg_ids = sorted(self.corpus.segments.keys())
        txt_ids = sorted(self.corpus.text.keys())

        # we will check that the words are mostly in the lexicon later
        # same utterance-ids in segments and text
        if txt_ids == seg_ids:
            return

        dup_ids = duplicates(txt_ids)
        if dup_ids:
            self.log.debug(
                "utterance-ids used several times in text: {}".format(
                    len(dup_ids)))
            raise IOError(
                "utterance-ids used several times in text: {}".format(
                    dup_ids))

        # no duplicates, so the two id sets genuinely differ
        txt_set = set(txt_ids)
        seg_set = set(seg_ids)

        self.log.debug(
            "utterances in text but not in segments: {}".format(
                txt_set - seg_set))
        self.log.debug(
            "utterances in segments but not in text: {}".format(
                seg_set - txt_set))

        raise IOError(
            "utterance-ids in segments and text are not consistent")
Example #3
0
    def _check_silences(self, phones):
        """Validate the silence inventory and return it.

        Appends the mandatory 'SIL' and 'SPN' markers to
        self.corpus.silences when missing (in-place mutation). Raises
        IOError on empty entries, duplicated silence symbols, or
        symbols shared with `phones`.
        """
        self.log.debug('checking silences')
        sils = self.corpus.silences

        self._check_empty_entry(sils)

        dup_sils = duplicates(sils)
        if dup_sils:
            raise IOError(
                "following symbols are used several times in silences: {}"
                .format(dup_sils))

        # make sure the two reserved markers are present
        for reserved in ('SIL', 'SPN'):
            if reserved not in sils:
                self.log.debug(
                    "adding missing '{}' symbol to silences".format(reserved))
                sils.append(reserved)

        shared = set(sils) & set(phones)
        if shared:
            raise IOError(
                "The following symbols are used in both phones "
                "and silences: {}".format(shared))

        return sils
Example #4
0
    def validate_segments(self, meta):
        """Checking utterances list in segments.

        `meta` maps wavefile names to metadata objects exposing at
        least a `duration` attribute (seconds). Raises IOError on
        missing .wav extensions, duplicated utterance-ids, or
        references to wavefiles absent from the corpus; logs the
        utterances too short to be usable.
        """
        self.log.debug("checking segments")
        segments = self.corpus.segments
        utt_ids = segments.keys()

        # split the (wav, start, stop) triples in a single pass
        utt_wavs, starts, stops = [], [], []
        for wav, start, stop in segments.values():
            utt_wavs.append(wav)
            starts.append(start)
            stops.append(stop)

        # wav extension in segments (message typo fixed: 'segmetns')
        _no_wavs_extension = [w for w in utt_wavs if not w.endswith('.wav')]
        if _no_wavs_extension:
            raise IOError(
                'There are wav-ids in segments without .wav extension: {}'
                .format(resume_list(_no_wavs_extension)))

        # unique utterance-ids
        _duplicates = duplicates(utt_ids)
        if _duplicates:
            raise IOError(
                "There are utterance-ids in segments used several times: {}"
                .format(_duplicates))

        # all referenced wavs are in wav folder
        ref_wavs = set(utt_wavs)
        missing_wavefiles = set.difference(ref_wavs, self.corpus.wavs)
        if missing_wavefiles:
            raise IOError(
                "The following wavefiles are referenced "
                "in segments but are not in wavs {}"
                .format(missing_wavefiles))

        if (len(ref_wavs) == len(utt_wavs) and
                all(e is None for e in starts) and
                all(e is None for e in stops)):
            # simple case, with one utterance per file and no explicit
            # timestamps provided: just get the list of files that are
            # very short (less than self.wav_min_duration seconds)
            short_wavs = [utt_id for utt_id, w in zip(utt_ids, utt_wavs)
                          if meta[w].duration < self.wav_min_duration]
        else:
            # more complicated case: find all utterances (plus
            # timestamps) associated to each wavefile and, for each
            # wavefile, check consistency of the timestamps of all
            # utterances inside it
            warning, short_wavs = self._check_timestamps(meta)
            if warning:
                self.log.warning(
                    "Some utterances are overlapping in time, "
                    "see details in log file")

        if short_wavs:
            self.log.debug(
                "The following utterances are less than 100 ms long and "
                "won't be used in kaldi recipes: {}"
                .format(resume_list(short_wavs)))
Example #5
0
    def validate_speakers(self):
        """Checking speakers from corpus.utt2spk.

        Raises IOError when utt2spk and segments reference different
        utterance-ids, when speaker-ids have varying lengths, or when
        an utterance-id is not prefixed by its own speaker-id.
        """
        self.log.debug("checking speakers")

        utt_ids_spk = self.corpus.utt2spk.keys()
        speakers = self.corpus.utt2spk.values()
        utt_ids = self.corpus.segments.keys()

        # same utterance-ids in segments and utt2spk
        if sorted(utt_ids_spk) != sorted(utt_ids):
            _duplicates = duplicates(utt_ids_spk)
            if _duplicates:
                raise IOError(
                    "The following utterance-ids are used several times "
                    "in utt2spk: {}".format(_duplicates))
            else:
                e_spk = set(utt_ids_spk)
                e_seg = set(utt_ids)

                self.log.debug(
                    "Utterances in utt2spk that are not in segments: {}"
                    .format(set.difference(e_spk, e_seg)))

                self.log.debug(
                    "Utterances in segments that are not in utt2spk: {}"
                    .format(set.difference(e_seg, e_spk)))

                raise IOError(
                    "Utterance-ids in segments and utt2spk are "
                    "not consistent, see details in log")

        # nothing more to check on an empty mapping (and no speaker to
        # take a reference length from, which crashed with IndexError)
        if len(self.corpus.utt2spk) == 0:
            return

        # speaker ids must have a fixed length: they are used below as
        # fixed-width prefixes of the utterance-ids
        default_len = len(list(speakers)[0])
        if not all(len(s) == default_len for s in speakers):
            self.log.debug(
                "Speaker-ids length observed in utt2spk with associated "
                "frequencies: {0}".format(
                    collections.Counter([len(s) for s in speakers])))

            raise IOError(
                "All speaker-ids must have the same length.")

        # each speaker id must be a prefix of its own utterance-id.
        # Bug fix: the original zipped segments.keys() with
        # utt2spk.values(), two different dicts whose iteration orders
        # are unrelated, so pairs could be mismatched; iterate utt2spk
        # itself instead.
        for utt, spk in self.corpus.utt2spk.items():
            if utt[:default_len] != spk:
                raise IOError(
                    "All utterance-ids must be prefixed by the "
                    "corresponding speaker-id")
Example #6
0
    def _check_variants(self, phones, sils):
        """Validate variant symbols against phones and silences.

        Every symbol appearing in a variants group must belong to
        `phones` or `sils`, and no symbol may appear twice across all
        groups. Returns the union of `phones` and `sils` as a set.
        Raises IOError on any violation.
        """
        self.log.debug('checking variants')
        variants = self.corpus.variants

        all_symbols = [symbol for group in variants for symbol in group]
        unknown_symbols = [
            symbol for symbol in all_symbols
            if symbol not in phones and symbol not in sils
        ]
        if unknown_symbols:
            raise IOError("The following symbols are present in variants, "
                          "but are neither in phones nor in silences: "
                          "{}".format(unknown_symbols))

        # Bug fix: this duplicates check was dead code, indented inside
        # the `if unknown_symbols` branch after its raise; it must run
        # unconditionally.
        _duplicates = duplicates(all_symbols)
        if _duplicates:
            raise IOError("The following symbols are used several times "
                          "in variants: {}".format(_duplicates))

        return set.union(set(phones), set(sils))
Example #7
0
    def validate_lexicon(self, inventory):
        """Check the pronunciation lexicon against the phone `inventory`.

        Verifies that every word has a non-empty, unique transcription,
        ensures a '<unk>' -> 'SPN' OOV entry exists (mutating
        self.corpus.lexicon if it is missing), logs statistics on
        unused words, out-of-vocabulary words and homophones, and
        raises IOError when the lexicon is inconsistent or uses phones
        absent from `inventory`.

        NOTE(review): relies on dict.iteritems()/itervalues() and on
        dict.keys() returning a list (appended to below), i.e. Python 2
        semantics.
        """
        self.log.debug("checking lexicon")

        dict_words = self.corpus.lexicon.keys()
        # each transcription is a whitespace-separated phone string
        transcriptions = [t.split() for t in self.corpus.lexicon.values()]

        # checks all words have a non empty transcription
        empties = {
            k: v
            for k, v in self.corpus.lexicon.iteritems() if v.strip() == ''
        }
        if empties:
            raise IOError(
                'the following words have no transcription in lexicon: {}'.
                format(empties.keys()))

        # unique word entries (alternative pronunciations are not
        # currently supported)
        _duplicates = duplicates(dict_words)
        if _duplicates:
            raise IOError(
                "Alternative pronunciations are not currently supported. "
                "Following words have several transcriptions in lexicon: {}".
                format(_duplicates))

        # OOV item: '<unk>' must exist and be transcribed as the
        # reserved vocal-noise phone 'SPN'
        if u"<unk>" not in dict_words:
            self.log.debug("adding '<unk>' word to lexicon")
            dict_words.append("<unk>")
            transcriptions.append(["SPN"])
            self.corpus.lexicon['<unk>'] = 'SPN'
        else:
            if transcriptions[dict_words.index("<unk>")] != ["SPN"]:
                raise IOError("'<unk>' word is reserved for mapping "
                              "OOV items and should always be transcribed "
                              "as 'SPN' (vocal) noise'")
        # TODO should we log a warning for all words containing silence phones?

        # unused words
        used_words = [
            word for utt in self.corpus.text.itervalues()
            for word in utt.split()
        ]
        dict_words_set = set(dict_words)
        used_word_types = set(used_words)
        used_word_counts = collections.Counter(used_words)
        used_dict_words = set.intersection(dict_words_set, used_word_types)
        self.log.debug("{} dictionary words used out of {}".format(
            len(used_dict_words), len(dict_words_set)))

        # oov words
        oov_word_types = set.difference(used_word_types, dict_words_set)
        oov_word_counts = collections.Counter(
            {oov: used_word_counts[oov]
             for oov in oov_word_types})
        nb_oov_tokens = sum(oov_word_counts.values())
        nb_oov_types = len(oov_word_types)

        self.log.debug(
            u"{} OOV word types in transcriptions out of {} types in total".
            format(nb_oov_types, len(used_word_types)))

        self.log.debug(
            u"{} OOV word tokens in transcriptions out of {} tokens in total".
            format(nb_oov_tokens, len(used_words)))

        self.log.debug(
            u"list of OOV word types with occurences counts: {0}".format(
                self._strcounts2unicode(oov_word_counts.most_common())))

        # raise alarm if the proportion of oov words is too large
        # either in terms of types or tokens
        # NOTE(review): raises ZeroDivisionError if the corpus text is
        # empty (no used word types/tokens) — confirm callers guard this
        oov_proportion_types = nb_oov_types / float(len(used_word_types))
        self.log.debug(
            "Proportion of oov word types: {}".format(oov_proportion_types))
        if oov_proportion_types > 0.1:
            self.log.warning('More than 10 percent of word '
                             'types used are Out-Of-Vocabulary items!')

        oov_proportion_tokens = nb_oov_tokens / float(len(used_words))
        self.log.debug(
            "Proportion of oov word tokens: {}".format(oov_proportion_tokens))
        if oov_proportion_tokens > 0.1:
            self.log.warning('More than 10 percent of word '
                             'tokens used are Out-Of-Vocabulary items!')

        # homophones (issue warnings only): count how many words map to
        # each phone sequence, then keep the sequences used more than once
        counts = collections.Counter(
            [u" ".join(phone_trans) for phone_trans in transcriptions])

        duplicate_transcripts = collections.Counter(
            {trans: counts[trans]
             for trans in counts if counts[trans] > 1})

        if duplicate_transcripts:
            self.log.warning(
                "There are homophones in the pronunciation dictionary")

            self.log.debug(
                'There are %s phone sequences that correspond to several words'
                ' in the pronunciation dictionary', len(duplicate_transcripts))

            self.log.debug(
                'There are %s word types with homophones in the pronunciation '
                'dictionary', sum(duplicate_transcripts.values()))

            self.log.debug(
                "List of homophonic phone sequences in lexicon "
                "with number of corresponding word types: %s",
                resume_list(duplicate_transcripts.most_common()))

            # # Commented because it takes a lot of times for certain corpora
            # # Maybe put it as an option
            # # get word types:
            # #    - found in transcriptions
            # #    - with at least one homophonic word type also found
            # #      in transcriptions
            # homophonic_sequences = duplicate_transcripts.keys()
            # homophony_groups = {}
            # for homo_transcript in homophonic_sequences:
            #     homo_group = [word for word, transcript
            #                   in zip(dict_words, str_transcripts)
            #                   if transcript == homo_transcript
            #                   and word in used_word_types]
            #     if len(homo_group) > 1:
            #         homophony_groups[homo_transcript] = homo_group

            # nb_homo_types = sum([len(v) for v in homophony_groups.values()])
            # self.log.warning(
            #     "{0} word types found in transcriptions with "
            #     "at least one homophone also found in transcriptions "
            #     "out of {1} word types in total").format(
            #         nb_homo_types, len(used_word_types))

            # nb_homo_tokens = sum([
            #     sum([used_word_counts[word] for word in group])
            #     for group in homophony_groups.values()])

            # self.log.warning(
            #     "{0} corresponding word tokens out of {1} total".format(
            #         nb_homo_tokens, len(used_words)))

            # l = [", ".join([
            #     word + u": " + unicode(used_word_counts[word])
            #     for word in group]) for group in homophony_groups.values()]

            # self.log.warning(
            #     "List of groups of homophonic word types "
            #     "(including only types actually found in transcriptions) "
            #     "with number of occurences of each member of each group:\n{}"
            #     .format(resume_list(l)))

        # ooi phones (out-of-inventory): fatal error
        used_phones = [
            phone for trans_phones in transcriptions for phone in trans_phones
        ]

        ooi_phones = [
            phone for phone in set(used_phones) if phone not in inventory
        ]

        if ooi_phones:
            raise IOError(
                u"phonetic dictionary uses out-of-inventory phones: {0}".
                format(ooi_phones))

        # warning for unused phones
        unused_phones = set.difference(inventory, used_phones)
        if unused_phones:
            self.log.debug("The following phones are never found "
                           "in the transcriptions: {}".format(unused_phones))