예제 #1
0
 def setUp(self):
     """Create a small dictionary and the phonetizer under test."""
     self.dd = sppasDictPron()
     self.grph = sppasDictPhonetizer(self.dd)
     # three trivial grapheme->phoneme entries plus the short-pause symbol
     for letter in ("a", "b", "c"):
         self.dd.add_pron(letter, letter)
     self.dd.add_pron(SP_ORTHO, SP)
예제 #2
0
    def gen_slm_dependencies(self, basename, N=3):
        """Generate the dependencies (slm, dictionary) for julius.

        :param basename: (str) base name of the slm and dictionary files
        :param N: (int) Language model N-gram length.

        """
        dict_name = basename + ".dict"
        slm_name = basename + ".arpa"

        # Build a pronunciation dictionary from the aligned tokens/phones.
        pron_dict = sppasDictPron()
        for word, pronunciations in zip(self._tokens.split(),
                                        self._phones.split()):
            for var in pronunciations.split("|"):
                pron_dict.add_pron(word, var.replace("-", " "))

        # Sentence-boundary symbols are phonetized as silence when missing.
        for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
            if pron_dict.is_unk(symbol) is True:
                pron_dict.add_pron(symbol, SIL_PHON)

        pron_dict.save_as_ascii(dict_name, False)

        # Estimate and write the statistical language model (ARPA format).
        ngram_model = sppasNgramsModel(N)
        ngram_model.append_sentences([self._tokens])
        writer = sppasArpaIO()
        writer.set(ngram_model.probabilities(method="logml"))
        writer.save(slm_name)
예제 #3
0
    def gen_slm_dependencies(self, basename, N=3):
        """Generate the dependencies (slm, dictionary) for julius.

        :param basename: (str) base name of the slm and dictionary files
        :param N: (int) Language model N-gram length.

        """
        dictname = basename + ".dict"
        slmname = basename + ".arpa"

        phoneslist = self._phones.split()
        tokenslist = self._tokens.split()

        dictpron = sppasDictPron()

        # Fill the dictionary: one entry per token, one line per variant.
        for token, pron in zip(tokenslist, phoneslist):
            for variant in pron.split("|"):
                dictpron.add_pron(token, variant.replace("-", " "))

        # Sentence-boundary symbols are phonetized as a silence.
        if dictpron.is_unk(START_SENT_SYMBOL) is True:
            dictpron.add_pron(START_SENT_SYMBOL, "sil")
        if dictpron.is_unk(END_SENT_SYMBOL) is True:
            dictpron.add_pron(END_SENT_SYMBOL, "sil")

        dictpron.save_as_ascii(dictname, False)

        # Write the SLM
        model = sppasNgramsModel(N)
        model.append_sentences([self._tokens])
        probas = model.probabilities(method="logml")
        arpaio = sppasArpaIO()
        arpaio.set(probas)
        arpaio.save(slmname)
예제 #4
0
 def setUp(self):
     """Set up a tiny dictionary and its phonetizer."""
     self.dd = sppasDictPron()
     self.grph = sppasDictPhonetizer(self.dd)
     # identity entries for three letters, plus '+' mapped to a short pause
     for char in ("a", "b", "c"):
         self.dd.add_pron(char, char)
     self.dd.add_pron("+", "sp")
예제 #5
0
    def set_dict(self, dict_filename):
        """Set the pronunciation dictionary.

        :param dict_filename: (str) The pronunciation dictionary in HTK-ASCII
        format with UTF-8 encoding.

        """
        # Load the dictionary (no dump file) and rebuild the phonetizer
        # with the current mapping table.
        loaded = sppasDictPron(dict_filename, nodump=False)
        self.phonetizer = sppasDictPhonetizer(loaded, self.maptable)
예제 #6
0
File: sppasphon.py  Project: lym0302/sppas
    def set_dict(self, dict_filename):
        """Set the pronunciation dictionary.

        :param dict_filename: (str) The pronunciation dictionary in HTK-ASCII
        format with UTF-8 encoding.

        """
        # A new phonetizer is created from the freshly loaded dictionary.
        self.phonetizer = sppasDictPhonetizer(
            sppasDictPron(dict_filename, nodump=False),
            self.maptable)
예제 #7
0
    def add_from_dict(self, dict_filename):
        """Add the list of phones from a pronunciation dictionary.

        :param dict_filename: (str) Name of an HTK-ASCII pronunciation dict

        """
        pron_dict = sppasDictPron(dict_filename)
        for entry in pron_dict:
            pronunciations = pron_dict.get_pron(entry)
            # each variant is a sequence of phonemes; add them one by one
            for variant in pronunciations.split(separators.variants):
                for phoneme in variant.split(separators.phonemes):
                    self.add(phoneme)
예제 #8
0
File: phoneset.py  Project: lym0302/sppas
    def add_from_dict(self, dict_filename):
        """Add the list of phones from a pronunciation dictionary.

        :param dict_filename: (str) Name of an HTK-ASCII pronunciation dictionary

        """
        pron_dict = sppasDictPron(dict_filename)
        for entry in pron_dict:
            # split each entry into variants, and variants into phonemes
            for variant in pron_dict.get_pron(entry).split(VARIANTS_SEPARATOR):
                for phoneme in variant.split(PHONEMES_SEPARATOR):
                    self.add(phoneme)
예제 #9
0
    def add_from_dict(self, dict_filename):
        """Add the list of phones from a pronunciation dictionary.

        :param dict_filename: (str) Name of an HTK-ASCII pronunciation dictionary

        """
        pron_dict = sppasDictPron(dict_filename)
        for entry in pron_dict:
            pronunciations = pron_dict.get_pron(entry)
            # walk every phoneme of every pronunciation variant
            for variant in pronunciations.split(sppasDictPron.VARIANTS_SEPARATOR):
                for phoneme in variant.split(sppasDictPron.PHONEMES_SEPARATOR):
                    self.add(phoneme)
예제 #10
0
    def test_phonetizeFR(self):
        """Phonetize French entries with the 'fra' resource dictionary."""
        dict_file = os.path.join(SPPAS, "resources", "dict", "fra.dict")
        phonetizer = sppasDictPhonetizer(sppasDictPron(dict_file))

        # an out-of-vocabulary token maps to 'UNK' without unknown-word
        # phonetization, and to all plausible variants with it
        self.assertEqual('UNK',
                         phonetizer.phonetize('pas_encore', phonunk=False))
        self.assertEqual(
            "p.a.a~.k.o.r|p.a.z.a~.k.o.r|p.a.a~.k.o.r.eu|p.a.z.a~.k.o.r.eu",
            phonetizer.phonetize('pas_encore', phonunk=True))

        # a '/.../' entry is returned as-is, whatever the unk option
        self.assertEqual(u"lemot", phonetizer.phonetize(u'/lemot/', phonunk=True))
        self.assertEqual(u"lemot", phonetizer.phonetize(u'/lemot/', phonunk=False))
예제 #11
0
    def test_phonetizeFR(self):
        """Phonetization of French tokens against the 'fra' dictionary."""
        dictdir = os.path.join(SPPAS, "resources", "dict")
        dictfile = os.path.join(dictdir, "fra.dict")
        dd = sppasDictPron(dictfile)
        grph = sppasDictPhonetizer(dd)

        # without unknown-word phonetization an OOV token maps to 'UNK'
        self.assertEqual(grph.phonetize('pas_encore', phonunk=False), 'UNK')

        # with it, all plausible variants are proposed
        expected = ("p.a.a~.k.o.r|p.a.z.a~.k.o.r|"
                    "p.a.a~.k.o.r.eu|p.a.z.a~.k.o.r.eu")
        self.assertEqual(grph.phonetize('pas_encore', phonunk=True), expected)

        # tokens enclosed in slashes are kept unchanged in both modes
        for unk in (True, False):
            self.assertEqual(grph.phonetize(u'/lemot/', phonunk=unk), u"lemot")
예제 #12
0
    def test_data(self):
        """Phonetize English entries, without then with the eng-fra mapping."""
        dictfile = os.path.join(RESOURCES_PATH, "dict", "eng.dict")
        map_table = os.path.join(RESOURCES_PATH, "dict", "eng-fra.map")
        mapt = sppasMapping(map_table)
        dd = sppasDictPron(dictfile)
        grph = sppasDictPhonetizer(dd)
        self.assertEqual(grph.get_phon_entry("THE"), "D-@|D-V|D-i:")
        self.assertEqual(grph.get_phon_entry("UR"), "3:r|U-r\\")
        # backslashes are doubled: "\|" is an invalid escape sequence that
        # raises a SyntaxWarning on recent CPython (same runtime value).
        self.assertEqual(grph.get_phon_entry("ARE"), "A-r\\|3:r")
        self.assertEqual(grph.get_phon_entry("BANC"), "b-{-N-k")

        grph.set_maptable(mapt)
        the = "z-@|D-@|v-@|v-V|D-V|z-V|z-9|D-9|v-9|z-i:|z-i|D-i|v-i|D-i:|v-i:"
        ur = "3:r|9-R|u-r\\|U-w|u-w|U-R|U-r\\|u-R"
        are = "a-R|A-R|a-w|A-w|a-r\\|A-r\\|3:r|9-R"
        self.assertEqual(set(grph.get_phon_entry("THE").split("|")), set(the.split("|")))
        self.assertEqual(set(grph.get_phon_entry("UR").split("|")), set(ur.split("|")))
        self.assertEqual(set(grph.get_phon_entry("ARE").split("|")), set(are.split("|")))
예제 #13
0
    def test_phon_from_loaded_data(self):
        """... Phonetization using real resource data."""

        dict_file = os.path.join(paths.resources, "dict", "eng.dict")
        map_table = os.path.join(paths.resources, "dict", "eng-fra.map")
        mapt = sppasMapping(map_table)
        grph = sppasDictPhonetizer(sppasDictPron(dict_file))

        # without mapping: pronunciations come straight from the dictionary
        plain = (("THE", "D-@|D-V|D-i:"),
                 ("UR", "3:r|U-r\\"),
                 ("ARE", "A-r\\|3:r"),
                 ("BANC", "b-{-N-k"))
        for entry, expected in plain:
            self.assertEqual(set(expected.split('|')),
                             set(grph.get_phon_entry(entry).split('|')))

        grph.set_maptable(mapt)
        grph.set_unk_variants(0)

        # with mapping, each phoneme is expanded by the map table:
        # DICT:   the [] D @   /    the(2) [] D V    /    the(3) [] D i:
        # MAP:    D z   /   i: i    /    V 9    /   V @
        # DICT:  ur [] 3:r   /   ur(2) [] U r\
        # MAP:   3:r 9-R   /  U u   /    r\ R   /   r\ w
        # DICT:  are [] A r\    /    are(2) [] 3:r
        # MAP:   r\ R   /   r\ w    /   3:r 9-R    / A a
        mapped = (
            ("THE", "D-@|D-V|D-i:|z-@|z-V|z-i:|D-i|z-i|D-9|z-9|z-@"),
            ("UR", "3:r|U-r\\|9-R|u-r\\|U-R|U-w|u-R|u-w"),
            ("ARE", "A-r\\|3:r|a-r\\|9-R|A-R|A-w|a-R|a-w"))
        for entry, expected in mapped:
            self.assertEqual(set(expected.split("|")),
                             set(grph.get_phon_entry(entry).split("|")))
예제 #14
0
    def gen_dependencies(self, grammarname, dictname):
        """Generate the dependencies (grammar, dictionary) for HVite.

        :param grammarname: (str) the file name of the tokens
        :param dictname: (str) the dictionary file name

        """
        pron_dict = sppasDictPron()

        with codecs.open(grammarname, 'w', encoding) as flab:
            for token, pron in zip(self._tokens.split(),
                                   self._phones.split()):
                # dictionary: one line per pronunciation variant
                for variant in pron.split("|"):
                    pron_dict.add_pron(token, variant.replace("-", " "))
                    if self._infersp is True:
                        # optionally append a short silence to the variant
                        pron_dict.add_pron(
                            token, (variant + '-sil').replace("-", " "))
                # lab file (one token per line)
                flab.write(token + "\n")

        pron_dict.save_as_ascii(dictname)
예제 #15
0
    def gen_dependencies(self, grammar_name, dict_name):
        """Generate the dependencies (grammar, dictionary) for HVite.

        :param grammar_name: (str) the file name of the tokens
        :param dict_name: (str) the dictionary file name

        """
        dictpron = sppasDictPron()

        with codecs.open(grammar_name, 'w', sg.__encoding__) as flab:

            for token, pron in zip(self._tokens.split(), self._phones.split()):

                # dictionary: one line per pronunciation variant
                for variant in pron.split("|"):
                    dictpron.add_pron(token, variant.replace("-", " "))

                # lab file (one token per line)
                flab.write(token + "\n")

        dictpron.save_as_ascii(dict_name)
예제 #16
0
                    help='Input dictionary file name (as many as wanted)')

parser.add_argument("--quiet",
                    action='store_true',
                    help="Disable the verbosity")

# Print the help message when the script is launched without arguments.
if len(sys.argv) <= 1:
    sys.argv.append('-h')

args = parser.parse_args()

# ----------------------------------------------------------------------------

# NOTE(review): parse_args() is called a second time here; the second call is
# redundant since sys.argv did not change in between.
args = parser.parse_args()

pron_dict = sppasDictPron(args.i, nodump=True)

# Compare the number of phonemes of each pronunciation variant to the number
# of characters of the entry, and report suspicious ones.
for entry in pron_dict:

    prons = pron_dict.get_pron(entry)
    nb_chars = float(len(entry))

    for pron in prons.split(sppasDictPron.VARIANTS_SEPARATOR):

        phonetization = pron.split(sppasDictPron.PHONEMES_SEPARATOR)
        nb_phones = float(len(phonetization))

        # fewer than half as many phonemes as characters: flag as "small"
        # NOTE(review): '{:s}'.format(bytes) raises TypeError on Python 3;
        # the .encode('utf8') calls look Python-2 specific — confirm.
        if nb_phones < nb_chars * 0.5:
            print("{:s}\t{:s}\tsmall".format(entry.encode('utf8'), pron.encode('utf8')))

        elif nb_phones > nb_chars * 1.8:
예제 #17
0
    # Verbosity: full logging unless --quiet was given.
    if not args.quiet:
        log_level = cg.log_level
    else:
        log_level = cg.quiet_log_level
    lgs = sppasLogSetup(log_level)
    lgs.stream_handler()

# ----------------------------------------------------------------------------

# Output options: keep variant numbers and filled brackets by default.
with_variant_nb = True
with_filled_brackets = True
if args.no_variant_numbers:
    with_variant_nb = False
if args.no_filled_brackets:
    with_filled_brackets = False
# The dictionary that will receive the entries of every input dictionary.
merge_dict = sppasDictPron()

# ----------------------------------------------------------------------------

# NOTE(review): args was presumably already parsed above; this second
# parse_args() call is redundant — confirm against the full script.
args = parser.parse_args()
for dict_file in args.i:

    if not args.quiet:
        print("Read input dictionary file: ")
    pron_dict = sppasDictPron(dict_file, nodump=True)
    if not args.quiet:
        print(" [  OK  ]")

    # Copy each entry, variant by variant, into the merged dictionary.
    for entry in pron_dict:
        prons = pron_dict.get_pron(entry)
        for pron in prons.split(separators.variants):
예제 #18
0
File: phonetize.py  Project: lym0302/sppas
# Verbose logging by default; only warnings and above with --quiet.
if args.quiet:
    setup_logging(30, None)
else:
    setup_logging(0, None)

# ----------------------------------------------------------------------------
# Automatic Phonetization is here:
# ----------------------------------------------------------------------------

# unknown-word phonetization is enabled unless explicitly disabled
unkopt = not args.nounk

mapfile = args.map if args.map else None

if args.i:
    # an input file was given: run the full annotation on it
    p = sppasPhon(args.dict, mapfile)
    p.set_unk(unkopt)
    p.set_usestdtokens(False)
    p.run(args.i, args.o)
else:
    # no input file: phonetize the standard input, line by line
    pdict = sppasDictPron(args.dict, nodump=False)
    maptable = sppasMapping()
    if mapfile is not None:
        maptable = sppasMapping(mapfile)
    phonetizer = sppasDictPhonetizer(pdict, maptable)
    for line in sys.stdin:
        print("{:s}".format(phonetizer.phonetize(line, unkopt)))
예제 #19
0
File: dictcheck.py  Project: lym0302/sppas
                    help='Input dictionary file name (as many as wanted)')

parser.add_argument("--quiet",
                    action='store_true',
                    help="Disable the verbosity")

# Print the help message when launched without any argument.
if len(sys.argv) <= 1:
    sys.argv.append('-h')

args = parser.parse_args()

# ----------------------------------------------------------------------------

# NOTE(review): parse_args() is already called just above; this second call
# is redundant.
args = parser.parse_args()

pron_dict = sppasDictPron(args.i, nodump=True)

# Flag entries whose phonetization looks too short for their spelling.
for entry in pron_dict:

    prons = pron_dict.get_pron(entry)
    nb_chars = float(len(entry))

    for pron in prons.split(VARIANTS_SEPARATOR):

        phonetization = pron.split(PHONEMES_SEPARATOR)
        nb_phones = float(len(phonetization))

        # NOTE(review): '{:s}'.format(bytes) fails on Python 3; the
        # .encode('utf8') calls look Python-2 specific — confirm.
        if nb_phones < nb_chars * 0.5:
            print("{:s}\t{:s}\tsmall".format(entry.encode('utf8'),
                                             pron.encode('utf8')))
예제 #20
0
                    help="Disable the verbosity")

# Print the help message when launched without any argument.
if len(sys.argv) <= 1:
    sys.argv.append('-h')

args = parser.parse_args()

# ----------------------------------------------------------------------------

# Output options: keep variant numbers and filled brackets by default.
with_variant_nb = True
with_filled_brackets = True
if args.no_variant_numbers:
    with_variant_nb = False
if args.no_filled_brackets:
    with_filled_brackets = False
# Dictionary receiving the entries of every input dictionary.
merge_dict = sppasDictPron()

# ----------------------------------------------------------------------------

# NOTE(review): args is parsed a second time here; the call is redundant.
args = parser.parse_args()
for dict_file in args.i:

    if not args.quiet:
        print("Read input dictionary file: ")
    pron_dict = sppasDictPron(dict_file, nodump=True)
    if not args.quiet:
        print(" [  OK  ]")

    # Walk every entry and every pronunciation variant of the input dict.
    for entry in pron_dict:
        prons = pron_dict.get_pron(entry)
        for pron in prons.split(sppasDictPron.VARIANTS_SEPARATOR):
예제 #21
0
# ----------------------------------------------------------------------------

# be verbose unless --quiet was given
if not args.quiet:
    setup_logging(1, None)

# ----------------------------------------------------------------------------
# Automatic Phonetization is here:
# ----------------------------------------------------------------------------

# unknown-word phonetization is enabled unless explicitly disabled
unkopt = not args.nounk

mapfile = args.map if args.map else None

if args.i:
    # input file(s) given: run the full annotation
    annot = sppasPhon(args.dict, mapfile)
    annot.set_unk(unkopt)
    annot.set_usestdtokens(False)
    annot.run(args.i, args.o)
else:
    # otherwise phonetize the standard input, line by line
    pdict = sppasDictPron(args.dict, nodump=False)
    maptable = sppasMapping()
    if mapfile is not None:
        maptable = sppasMapping(mapfile)
    phonetizer = sppasDictPhonetizer(pdict, maptable)
    for line in sys.stdin:
        print("{:s}".format(phonetizer.phonetize(line, unkopt)))