Пример #1
0
def main(options, args):
    """Set up a translator as directed by *options* and run the requested stages.

    When ``options.fakeTranslator`` is set, a ``MemoryTranslator`` is built
    from the sample it names.  Otherwise a model is obtained through
    ``SequiturTool.procureModel`` and wrapped in a ``Translator`` only if a
    test sample or an apply sample was requested.  *args* is not used.

    Returns:
        1 when ``procureModel`` yields no model, otherwise ``None``.
    """
    loadSample = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, loadSample, log=stdout)
        if not model:
            return 1
        if options.testSample or options.applySample:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        # The local name is not needed past this point.
        del model
    else:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))

    # Test stage: statistics are reported on standard output.
    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options)
        translator.reportStats(sys.stdout)

    # Apply stage: statistics are reported on standard error.
    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)
Пример #2
0
def g2pMain(options, args):
    """Prepare a G2P translator and, if requested, apply it to a single word.

    Log streams are codec writers for the locale's preferred encoding, using
    the ``backslashreplace`` error handler.  When ``options.fakeTranslator``
    is set a ``MemoryTranslator`` is used; otherwise a model is obtained via
    ``SequiturTool.procureModel``.  *args* is not used.

    Returns:
        1 when no model could be procured, the result of ``g2pApplyWord``
        when ``options.applyWord`` is set, otherwise ``None``.
    """
    import locale
    loadSample = loadG2PSample

    make_writer = codecs.getwriter(locale.getpreferredencoding())

    def wrap(stream):
        # Write to the stream's ``buffer`` attribute when it has one,
        # otherwise to the stream itself.
        target = stream.buffer if hasattr(stream, 'buffer') else stream
        return make_writer(target, errors='backslashreplace')

    log_stdout = wrap(sys.stdout)
    # Not read anywhere in this function.
    log_stderr = wrap(sys.stderr)

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model
    else:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))

    if options.applyWord:
        return g2pApplyWord(translator, options, log_stdout)
Пример #3
0
    def __init__(self, dict_path=__dict_path__, model_path=__model_path__):
        """Load the G2P model and the pronunciation dictionary.

        Args:
            dict_path: Path of the dictionary file; ``~`` is expanded.  Each
                line is split on single spaces: the first field becomes the
                key and the remaining fields the stored list.
            model_path: Path of the Sequitur model file; ``~`` is expanded.

        If the model cannot be procured an error is logged and initialisation
        stops early, leaving ``self._dict_`` empty and ``self.__model__``
        falsy.
        """
        self._dict_ = dict()
        dict_path = os.path.expanduser(dict_path)
        model_path = os.path.expanduser(model_path)
        self.__dict_path__ = dict_path
        self.__model_path__ = model_path

        # Options handed to SequiturTool.procureModel; only the model file
        # carries a real value, every other switch is turned off.
        sequitur_options = Values()
        sequitur_options.resume_from_checkpoint = False
        sequitur_options.modelFile = model_path
        sequitur_options.shouldRampUp = False
        sequitur_options.trainSample = False
        sequitur_options.shouldTranspose = False
        sequitur_options.newModelFile = False
        sequitur_options.shouldSelfTest = False
        self.__model__ = SequiturTool.procureModel(sequitur_options, None)
        if not self.__model__:
            logger.error('Can\'t load g2p model.')
            return
        self.__model__ = Translator(self.__model__)

        # Use a context manager so the dictionary file is closed
        # deterministically, and stream it instead of reading it all at once.
        with open(dict_path) as dict_file:
            for line in dict_file:
                fields = line.strip('\n').split(' ')
                self._dict_[fields[0]] = fields[1:]
Пример #4
0
def transliterate(model, word):
  """Translate *word* with the Sequitur model registered under *model*.

  *model* must be ``'pythainlp_lexicon'`` or ``'wiktionary_phonemic'``; any
  other key raises ``KeyError``.  The characters of *word* are passed to the
  translator as a tuple and the output symbols are joined with the connector
  configured for the chosen model.

  Returns the joined string, or ``1`` when no model could be procured.
  """

  class OptionBag:
    """Attribute container filled from keyword arguments."""

    def __init__(self, **entries):
      self.__dict__.update(entries)

  # model name -> (model file, connector placed between output symbols)
  known_models = {
    'pythainlp_lexicon': ('./lib/model-7', ''),
    'wiktionary_phonemic': ('./lib/tha-pt-b-7', '-'),
  }
  modelFile, connector = known_models[model]

  options = OptionBag(
    profile=None,
    resource_usage=None,
    psyco=None,
    tempdir=None,
    trainSample=None,
    develSample=None,
    testSample=None,
    checkpoint=None,
    resume_from_checkpoint=None,
    shouldTranspose=None,
    modelFile=modelFile,
    newModelFile=None,
    shouldTestContinuously=None,
    shouldSelfTest=None,
    lengthConstraints=None,
    shouldSuppressNewMultigrams=None,
    viterbi=None,
    shouldRampUp=None,
    shouldWipeModel=None,
    shouldInitializeWithCounts=None,
    minIterations=20,
    maxIterations=100,
    eager_discount_adjustment=None,
    fixed_discount=None,
    encoding='UTF-8',
    phoneme_to_phoneme=None,
    test_segmental=None,
    testResult=None,
    applySample=None,
    applyWord=word,
    variants_mass=None,
    variants_number=None,
    fakeTranslator=None,
    stack_limit=None)

  sequitur_model = SequiturTool.procureModel(options, g2p.loadG2PSample)
  if not sequitur_model:
    return 1
  translator = g2p.Translator(sequitur_model)
  del sequitur_model

  symbols = translator(tuple(word))
  return connector.join(symbols)
Пример #5
0
def main(options, args):
    """Set up a translator as directed by *options* and run the requested stages.

    When ``options.fakeTranslator`` is set, a ``MemoryTranslator`` is built
    from the sample it names.  Otherwise a model is obtained through
    ``SequiturTool.procureModel`` and wrapped in a ``Translator`` only if a
    test sample or an apply sample was requested.  *args* is not used.

    Returns:
        1 when ``procureModel`` yields no model, otherwise ``None``.
    """
    loadSample = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, loadSample, log=stdout)
        if not model:
            return 1
        if options.testSample or options.applySample:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        # The local name is not needed past this point.
        del model
    else:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))

    # Test stage: statistics are reported on standard output.
    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options)
        translator.reportStats(sys.stdout)

    # Apply stage: statistics are reported on standard error.
    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)
Пример #6
0
def main(options, args):
    """Set up a translator and run the test / apply / single-word stages.

    Log streams are codec writers for the locale's preferred encoding, using
    the ``backslashreplace`` error handler.  When ``options.fakeTranslator``
    is set a ``MemoryTranslator`` is used; otherwise a model is obtained via
    ``SequiturTool.procureModel``.  *args* is not used.

    Returns:
        1 when no model could be procured, otherwise ``None``.
    """
    import locale
    loadSample = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    # options.encoding relates to the lexicon, not to standard I/O, so the
    # log streams follow the locale's preferred encoding instead.
    make_writer = codecs.getwriter(locale.getpreferredencoding())

    def wrap(stream):
        # Write to the stream's ``buffer`` attribute when it has one,
        # otherwise to the stream itself.
        target = stream.buffer if hasattr(stream, 'buffer') else stream
        return make_writer(target, errors='backslashreplace')

    log_stdout = wrap(sys.stdout)
    log_stderr = wrap(sys.stderr)

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model
    else:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))

    if options.testSample:
        test_sample = loadSample(options.testSample)
        mainTest(translator, test_sample, options, log_stdout)
        translator.reportStats(log_stdout)

    if options.applySample:
        # Results go to '-' opened with the lexicon encoding (or the default).
        apply_out = gOpenOut('-', options.encoding or defaultEncoding)
        mainApply(translator, options, apply_out)
        translator.reportStats(log_stderr)

    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
Пример #7
0
def load_g2p(model_path):
    """Procure the Sequitur model stored at *model_path*.

    The options passed to ``SequiturTool.procureModel`` name the model file
    and set every other switch used here to ``False``.

    Returns the procured model.  If none is returned, a message is printed
    and the process exits with status 1.
    """
    settings = (
        ('modelFile', model_path),
        ('resume_from_checkpoint', False),
        ('shouldRampUp', False),
        ('trainSample', False),
        ('shouldTranspose', False),
        ('shouldSelfTest', False),
        ('newModelFile', False),
    )
    sequitur_options = Values()
    for attribute, value in settings:
        setattr(sequitur_options, attribute, value)

    model = SequiturTool.procureModel(sequitur_options, None)
    if model:
        return model

    print('Can\'t load g2p model.')
    sys.exit(1)
Пример #8
0
    def __init__(self, modelfn=SEQUITUR_MODEL):
        """Procure the model named by *modelfn* and wrap it in a ``Translator``.

        The procured model is kept on ``self.model`` and the translator on
        ``self.translator``.  ``procureModel`` logs to ``sys.stdout``.
        """
        options = SeqOptionsObject()
        for attribute, value in (
                ('resume_from_checkpoint', False),
                ('modelFile', modelfn),
                ('shouldRampUp', False),
                ('trainSample', None),
                ('shouldTranspose', False),
                ('newModelFile', None),
                ('shouldSelfTest', False)):
            setattr(options, attribute, value)

        model = SequiturTool.procureModel(options, loadG2PSample, log=sys.stdout)
        self.model = model
        self.translator = Translator(model)
Пример #9
0
def main(options, args):
    """Procure a model and translate every line of ``options.applySample``.

    Nothing further happens when ``options.applySample`` is not set.
    Otherwise all whitespace-separated words of the file are passed to
    ``addUnknowns`` together with the model, and each line is translated as a
    tuple of its words.  A line whose translation raises
    ``TranslationFailure`` is printed as ``<translation-failed/>``.
    *args* is not used.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if not options.applySample:
        return

    lines = gopen(options.applySample).readlines()

    # Collect every word that occurs in the input.
    vocabulary = []
    for line in lines:
        vocabulary.extend(line.split())
    addUnknowns(model, Set(vocabulary))

    translator = Translator(model)
    for line in lines:
        try:
            output = " ".join(translator(tuple(line.split())))
        except translator.TranslationFailure:
            output = "<translation-failed/>"
        print(output)
Пример #10
0
def main(options, args):
    """Procure a model and translate every line of ``options.applySample``.

    Nothing further happens when ``options.applySample`` is not set.
    Otherwise all whitespace-separated words of the file are passed to
    ``addUnknowns`` together with the model, and each line is translated as a
    tuple of its words.  A line whose translation raises
    ``TranslationFailure`` is printed as ``<translation-failed/>``.
    *args* is not used.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if options.applySample:
        lines = gopen(options.applySample).readlines()
        words = Set([word for line in lines for word in line.split()])
        addUnknowns(model, words)
        translator = Translator(model)
        for line in lines:
            left = tuple(line.split())
            try:
                result = translator(left)
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3; the bare print statement is a
                # SyntaxError on Python 3.
                print(' '.join(result))
            except translator.TranslationFailure:
                print('<translation-failed/>')
Пример #11
0
 def __init__(self, model_path):
     """Procure the model at *model_path* and store a ``Translator`` for it.

     An options object is filled with fixed values (encoding
     ``ISO-8859-15``, apply sample ``args.txt``, 20 to 100 iterations, all
     remaining switches ``None``).  As a side effect the module globals
     ``defaultEncoding``, ``stdout`` and ``stderr`` are rebound: the latter
     two become stream writers for that encoding around ``sys.stdout`` and
     ``sys.stderr``.
     """
     import codecs
     global defaultEncoding
     global stdout, stderr

     class _Options(object):
         """Empty attribute holder for the option values set below."""

     options = _Options()
     for attribute, value in (
             ('testSample', None),
             ('modelFile', model_path),
             ('trainSample', None),
             ('encoding', 'ISO-8859-15'),
             ('shouldInitializeWithCounts', None),
             ('psyco', None),
             ('stack_limit', None),
             ('shouldTranspose', None),
             ('applySample', 'args.txt'),
             ('shouldRampUp', None),
             ('resume_from_checkpoint', None),
             ('lengthConstraints', None),
             ('checkpoint', None),
             ('eager_discount_adjustment', None),
             ('fakeTranslator', None),
             ('tempdir', None),
             ('profile', None),
             ('variants_number', None),
             ('maxIterations', 100),
             ('testResult', None),
             ('variants_mass', None),
             ('shouldSuppressNewMultigrams', None),
             ('develSample', None),
             ('shouldWipeModel', None),
             ('resource_usage', None),
             ('test_segmental', None),
             ('fixed_discount', None),
             ('newModelFile', None),
             ('minIterations', 20),
             ('shouldSelfTest', None),
             ('viterbi', None),
             ('shouldTestContinuously', None),
             ('phoneme_to_phoneme', None)):
         setattr(options, attribute, value)

     defaultEncoding = options.encoding
     # Stream-writer class of the codec registered for the chosen encoding.
     make_writer = codecs.lookup(options.encoding).streamwriter
     stdout = make_writer(sys.stdout)
     stderr = make_writer(sys.stderr)

     model = SequiturTool.procureModel(options, loadG2PSample, log=stdout)
     self.translator = Translator(model)
Пример #12
0
def mainTest(translator, testSample, options):
    """Evaluate *translator* on *testSample* and print the result to ``stdout``.

    The sample is transposed first when ``options.shouldTranspose`` is set.
    When ``options.testResult`` is set, the file it names is opened with
    ``gOpenOut`` and handed to the evaluator as ``resultFile``.  With
    ``options.test_segmental`` the evaluator gets a compare filter that drops
    the symbols ``.``, ``'`` and ``"``.
    """
    if options.shouldTranspose:
        testSample = SequiturTool.transposeSample(testSample)

    resultFile = None
    if options.testResult:
        resultFile = gOpenOut(options.testResult, defaultEncoding)

    from Evaluation import Evaluator
    evaluator = Evaluator()
    evaluator.setSample(testSample)
    evaluator.resultFile = resultFile
    evaluator.verboseLog = stdout

    if options.test_segmental:
        supraSegmental = set(('.', "'", '"'))

        def removeSupraSegmental(phon):
            # filter() is kept as is; a comprehension would always yield a list.
            return filter(lambda symbol: symbol not in supraSegmental, phon)

        evaluator.compareFilter = removeSupraSegmental

    result = evaluator.evaluate(translator)
    print >> stdout, result
Пример #13
0
# ===========================================================================
def main(options, args):
    """Procure a model and translate every line of ``options.applySample``.

    Nothing further happens when ``options.applySample`` is not set.
    Otherwise all whitespace-separated words of the file are passed to
    ``addUnknowns`` together with the model, and each line is translated as a
    tuple of its words.  A line whose translation raises
    ``TranslationFailure`` is printed as ``<translation-failed/>``.
    *args* is not used.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if options.applySample:
        lines = gopen(options.applySample).readlines()
        words = Set([word for line in lines for word in line.split()])
        addUnknowns(model, words)
        translator = Translator(model)
        for line in lines:
            left = tuple(line.split())
            try:
                result = translator(left)
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3; the bare print statement is a
                # SyntaxError on Python 3.
                print(' '.join(result))
            except translator.TranslationFailure:
                print('<translation-failed/>')

# ===========================================================================
if __name__ == '__main__':
    # Script entry point: build the option parser, parse the command line
    # and hand control to main() through tool.run().
    import optparse
    import tool

    optparser = optparse.OptionParser(usage='%prog [OPTION]... FILE...\n' + __doc__,
                                      version='%prog ' + __version__)
    SequiturTool.addOptions(optparser)
    tool.addTrainOptions(optparser)
    optparser.add_option('-a',
                         '--apply',
                         dest='applySample',
                         help='apply translation to sentences read from FILE',
                         metavar='FILE')
    options, args = optparser.parse_args()

    tool.run(main, options, args)
Пример #14
0
        mainTest(translator, loadSample(options.testSample), options)
        translator.reportStats(sys.stdout)

    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)


# ===========================================================================
if __name__ == '__main__':
    # Script entry point: build the command-line option parser.
    import optparse
    import tool

    optparser = optparse.OptionParser(
        usage='%prog [OPTION]... FILE...\n' + str(__doc__),
        version='%prog ' + __version__)
    tool.addOptions(optparser)
    SequiturTool.addTrainOptions(optparser)
    optparser.add_option(
        '-e', '--encoding', default='ISO-8859-15',
        help='use character set encoding ENC', metavar='ENC')
    optparser.add_option(
        '-P', '--phoneme-to-phoneme', action='store_true',
        help='train/apply a phoneme-to-phoneme converter')
    optparser.add_option(
        '--test-segmental', action='store_true',
        help='evaluate only at segmental level, i.e. do not count syllable boundaries and stress marks')
Пример #15
0
    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options)
        translator.reportStats(sys.stdout)

    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)

# ===========================================================================
if __name__ == '__main__':
    # Script entry point: build the command-line option parser.
    import optparse
    import tool

    optparser = optparse.OptionParser(usage='%prog [OPTION]... FILE...\n' + __doc__,
                                      version='%prog ' + __version__)
    tool.addOptions(optparser)
    SequiturTool.addTrainOptions(optparser)
    optparser.add_option('-e',
                         '--encoding',
                         default='ISO-8859-15',
                         help='use character set encoding ENC',
                         metavar='ENC')
    optparser.add_option('-P',
                         '--phoneme-to-phoneme',
                         action='store_true',
                         help='train/apply a phoneme-to-phoneme converter')
    optparser.add_option('--test-segmental',
                         action='store_true',
                         help='evaluate only at segmental level, i.e. do not count syllable boundaries and stress marks')
    optparser.add_option('-B',
                         '--result',
                         dest='testResult',
                         help='store test result in table FILE (for use with bootlog or R)',
                         metavar='FILE')
    optparser.add_option('-a',
                         '--apply',
                         dest='applySample',
                         help='apply grapheme-to-phoneme conversion to words read from FILE',
                         metavar='FILE')
Пример #16
0
        translator = Translator(model)
        for line in lines:
            left = tuple(line.split())
            try:
                result = translator(left)
                print(" ".join(result))
            except translator.TranslationFailure:
                print("<translation-failed/>")


# ===========================================================================
if __name__ == "__main__":
    # Script entry point: build the option parser, parse the command line
    # and hand control to main() through tool.run().
    import optparse
    import tool

    optparser = optparse.OptionParser(
        usage="%prog [OPTION]... FILE...\n" + __doc__,
        version="%prog " + __version__)
    SequiturTool.addOptions(optparser)
    tool.addTrainOptions(optparser)
    optparser.add_option(
        "-a", "--apply", dest="applySample",
        help="apply translation to sentences read from FILE", metavar="FILE")
    options, args = optparser.parse_args()

    tool.run(main, options, args)
Пример #17
0
def getOptParser():
    """Build and return the ``optparse.OptionParser`` for this tool.

    ``tool.addOptions`` and ``SequiturTool.addTrainOptions`` register their
    options first; the options specific to this script (encoding,
    phoneme-to-phoneme mode, testing, applying, variants, fake translator and
    stack limit) are added afterwards.

    Returns:
        The configured parser; the command line is not parsed here.
    """
    import optparse
    import tool

    optparser = optparse.OptionParser(usage='%prog [OPTION]... FILE...\n' +
                                      str(__doc__),
                                      version='%prog ' + __version__)
    tool.addOptions(optparser)
    SequiturTool.addTrainOptions(optparser)
    optparser.add_option('-e',
                         '--encoding',
                         default='ISO-8859-15',
                         help='use character set encoding ENC',
                         metavar='ENC')
    optparser.add_option('-P',
                         '--phoneme-to-phoneme',
                         action='store_true',
                         help='train/apply a phoneme-to-phoneme converter')
    optparser.add_option(
        '--test-segmental',
        action='store_true',
        help=
        'evaluate only at segmental level, i.e. do not count syllable boundaries and stress marks'
    )
    optparser.add_option(
        '-B',
        '--result',
        dest='testResult',
        help='store test result in table FILE (for use with bootlog or R)',
        metavar='FILE')
    optparser.add_option(
        '-a',
        '--apply',
        dest='applySample',
        help='apply grapheme-to-phoneme conversion to words read from FILE',
        metavar='FILE')
    optparser.add_option('-w',
                         '--word',
                         dest='applyWord',
                         help='apply grapheme-to-phoneme conversion to word',
                         metavar='string')
    # The backslash is escaped explicitly: '\s' is not a valid escape
    # sequence and triggers a warning on recent Python versions.  The help
    # text shown to the user is unchanged.
    optparser.add_option(
        '-V',
        '--variants-mass',
        type='float',
        help=
        'generate pronunciation variants until \\sum_i p(var_i) >= Q (only effective with --apply)',
        metavar='Q')
    optparser.add_option(
        '--variants-number',
        type='int',
        help=
        'generate up to N pronunciation variants (only effective with --apply)',
        metavar='N')
    optparser.add_option(
        '-f',
        '--fake',
        dest='fakeTranslator',
        help=
        'use a translation memory (read from sample FILE) instead of a genuine model (use in combination with -x to evaluate two files against each other)',
        metavar='FILE')
    optparser.add_option('--stack-limit',
                         type='int',
                         help='limit size of search stack to N elements',
                         metavar='N')
    return optparser