Example #1
def main():
    """Entry point of the 'wordseg-puddle' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-puddle',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # post-process arguments
    if args.train_file and (args.njobs or args.nfolds):
        raise ValueError(
            '--train-file option is incompatible with --njobs and --nfolds')
    args.njobs = args.njobs or 1
    args.nfolds = args.nfolds or 5

    # load the train text if any
    train_text = None
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise RuntimeError(f'train file not found: {args.train_file}')
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')

    # load train and test texts, ignore empty lines
    test_text = (line for line in streamin if line)
    if train_text:
        train_text = (line for line in train_text if line)

    segmented = segment(test_text,
                        train_text=train_text,
                        window=args.window,
                        by_frequency=args.by_frequency,
                        nfolds=args.nfolds,
                        njobs=args.njobs,
                        log=log)
    streamout.write('\n'.join(segmented) + '\n')
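
A minimal sketch of calling the puddle segmenter directly, without the CLI wrapper. The module path wordseg.algos.puddle, the toy utterances and the window value are assumptions; only the keyword names mirror the call above.

# Hypothetical direct call to the puddle segmenter (module path and values assumed)
from wordseg.algos.puddle import segment

# prepared (unsegmented) utterances, phones separated by spaces
prepared = ['hh ax l ow w er l d', 'g uh d m ao r n ih ng']

segmented = segment(prepared, window=2, nfolds=5, njobs=1)
print('\n'.join(segmented))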
Example #2
def main():
    """Entry point of the 'wordseg-baseline' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-baseline',
        description=__doc__,
        add_arguments=_add_arguments)

    # setup the seed for random number generation
    if args.random:
        log.info('setup random seed to %s', args.random)
    random.seed(args.random)

    if args.oracle:
        # load the oracle text
        if not os.path.isfile(args.oracle):
            raise ValueError('oracle file not found: {}'.format(args.oracle))
        oracle_text = list(codecs.open(args.oracle, 'r', encoding='utf8'))
        log.info('loaded %s utterances from oracle text', len(oracle_text))

        # initialize the oracle token separator
        oracle_separator = Separator(phone=args.phone_separator,
                                     syllable=args.syllable_separator,
                                     word=args.word_separator)

        segmented = segment_oracle(streamin,
                                   oracle_text,
                                   oracle_separator,
                                   args.level,
                                   log=log)
    else:
        segmented = segment(streamin, probability=args.probability, log=log)

    streamout.write('\n'.join(segmented) + '\n')
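
A hedged sketch of the non-oracle branch in isolation: the baseline inserts word boundaries at random according to the given probability. The module path wordseg.algos.baseline and the probability value are assumptions.

# Hypothetical direct call to the random baseline (module path assumed)
import random
from wordseg.algos.baseline import segment

random.seed(0)  # make the random boundaries reproducible
prepared = ['hh ax l ow w er l d']
print('\n'.join(segment(prepared, probability=0.5)))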
Example #3
def main():
    """Entry point of the 'wordseg-stats' command"""

    # options description
    def add_arguments(parser):
        parser.add_argument(
            '--json',
            action='store_true',
            help='print the results in JSON format, else print in raw text')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-stats',
        description=__doc__,
        add_arguments=add_arguments,
        separator=Separator())

    # compute the statistics
    stats = CorpusStatistics(streamin, separator, log=log)
    results = stats.describe_all()

    # display the results either as a JSON string or in raw text
    if args.json:
        streamout.write((json.dumps(results, indent=4)) + '\n')
    else:
        out = (' '.join((name, k, str(v))) for name, stats in results.items()
               for k, v in stats.items())
        streamout.write('\n'.join(out) + '\n')
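
An illustration of the raw-text branch above on a made-up results dict: each output line is 'section key value', obtained by flattening the nested dictionary returned by describe_all(). The numbers are invented for the example.

# Pure-Python illustration of the raw-text flattening (data is made up)
results = {
    'words': {'tokens': 13, 'types': 8},
    'phones': {'tokens': 41, 'types': 17}}

out = (' '.join((name, k, str(v)))
       for name, section in results.items()
       for k, v in section.items())
print('\n'.join(out))
# words tokens 13
# words types 8
# phones tokens 41
# phones types 17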
Example #4
def main():
    """Entry point of the 'wordseg-tp' command"""
    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-tp', description=__doc__, add_arguments=_add_arguments)

    # if the deprecated --probability option is used, emit a warning
    # and convert it to the new --dependency option.
    if args.probability is not None:
        log.warning('''-p/--probability option is deprecated (maintained for
            backward compatibility), please use -d/--dependency instead.''')

        if args.probability == 'forward':
            args.dependency = 'ftp'
        else:  # 'backward'
            args.dependency = 'btp'

    # segment the input text
    text = segment(streamin,
                   threshold=args.threshold,
                   dependency=args.dependency,
                   log=log)

    # output the result
    streamout.write('\n'.join(text) + '\n')
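
A hedged sketch of calling the transitional-probability segmenter directly. The module path wordseg.algos.tp and the threshold value are assumptions; 'ftp' and 'btp' are the dependency codes produced by the conversion above.

# Hypothetical direct call to the TP segmenter (module path and threshold assumed)
from wordseg.algos.tp import segment

prepared = ['hh ax l ow w er l d']
# 'ftp' = forward transitional probability, 'btp' = backward (see above)
segmented = segment(prepared, threshold='relative', dependency='ftp')
print('\n'.join(segmented))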
Example #5
def main():
    """Entry point of the 'wordseg-eval' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-eval', description=__doc__, add_arguments=_add_arguments)

    log.info('loads input and gold texts')

    # load the gold text as a list of utterances, remove empty lines
    gold = _load_text(codecs.open(args.gold, 'r', encoding='utf8'))

    # load the text as a list of utterances, remove empty lines
    text = _load_text(streamin)

    # load the prepared (unsegmented) text as a list of utterances,
    # remove empty lines
    if args.rand_index:
        units = _load_text(codecs.open(args.rand_index, 'r', encoding='utf8'))
    else:
        units = None

    # evaluation returns a dict of 'score name' -> float
    log.info('evaluates the segmentation')
    results = evaluate(text, gold, units=units)

    streamout.write('\n'.join(
        # display scores with 4 significant digits
        '{}\t{}'.format(k, '%.4g' % v if v is not None else 'None')
        for k, v in results.items()) + '\n')

    if args.summary:
        log.info('computes errors summary, writes to %s', args.summary)
        with codecs.open(args.summary, 'w', encoding='utf8') as fsummary:
            fsummary.write(json.dumps(summary(text, gold), indent=4))
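
A hedged sketch of the evaluation step outside the CLI; wordseg.evaluate as the module path is an assumption and the two-utterance texts are made up. The segmented and gold texts must contain the same utterances, differing only in word boundaries.

# Hypothetical direct call to evaluate() (module path assumed, data made up)
from wordseg.evaluate import evaluate

text = ['hello world', 'goodmorning']   # segmentation to be evaluated
gold = ['hello world', 'good morning']  # reference segmentation
results = evaluate(text, gold)          # dict: score name -> float (or None)

print('\n'.join(
    '{}\t{}'.format(k, '%.4g' % v if v is not None else 'None')
    for k, v in results.items()))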
Example #6
def main():
    """Entry point of the 'wordseg-puddle' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-puddle',
        description=__doc__,
        add_arguments=_add_arguments)

    segmented = segment(streamin,
                        window=args.window,
                        nfolds=args.nfolds,
                        njobs=args.njobs,
                        log=log)
    streamout.write('\n'.join(segmented) + '\n')
Example #7
def main():
    """Entry point of the 'wordseg-dibs' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dibs', description=__doc__, add_arguments=_add_arguments)

    # setup the separator from parsed arguments
    separator = Separator(phone=args.phone_separator,
                          syllable=args.syllable_separator,
                          word=args.word_separator)

    # ensure the train file exists
    if not os.path.isfile(args.train_file):
        raise ValueError('train file does not exist: {}'.format(
            args.train_file))

    # load train and test texts, ignore empty lines
    train_text = codecs.open(args.train_file, 'r', encoding='utf8')
    train_text = (line for line in train_text if line)
    test_text = (line for line in streamin if line)

    # train the model (learn diphone statistics)
    dibs_summary = CorpusSummary(train_text,
                                 separator=separator,
                                 level=args.unit,
                                 log=log)

    # segment the test text on the trained model
    output = segment(test_text,
                     dibs_summary,
                     type=args.type,
                     threshold=args.threshold,
                     pwb=args.pboundary,
                     log=log)

    # output the segmented text
    streamout.write('\n'.join(output) + '\n')

    # save the computed diphones if required
    if args.diphones:
        log.info('saving %s diphones to %s', len(dibs_summary.diphones),
                 args.diphones)

        output = ('{} {} {}'.format(v, k[0], k[1])
                  for k, v in sorted(dibs_summary.diphones.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True))

        codecs.open(args.diphones, 'w',
                    encoding='utf8').write('\n'.join(output) + '\n')
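
A hedged sketch of the same train-then-segment flow as plain function calls. The module path wordseg.algos.dibs, the separator values, the toy utterances and the parameter values are assumptions based on the code above.

# Hypothetical direct use of dibs (module path, data and parameters assumed)
from wordseg.separator import Separator
from wordseg.algos.dibs import CorpusSummary, segment

separator = Separator(phone=' ', syllable=';esyll', word=';eword')
train = ['hh ax l ow ;eword w er l d ;eword']  # word-tagged train utterance
test = ['g uh d m ao r n ih ng']               # prepared (unsegmented) test utterance

model = CorpusSummary(train, separator=separator, level='phone')
segmented = segment(test, model, type='phrasal', threshold=0.5)
print('\n'.join(segmented))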
Example #8
def main():
    """Entry point of the 'wordseg-dpseg' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dpseg',
        description=__doc__,
        add_arguments=_add_arguments)

    assert args.nfolds > 0

    # build the dpseg command arguments
    dpseg_args = {}
    excluded_args = ['verbose', 'quiet', 'input', 'output', 'nfolds', 'njobs']
    for k, v in vars(args).items():
        # ignored arguments
        if k in excluded_args or (v in (None, False) and v != 0):
            continue

        if k == 'estimator':
            v = {
                'viterbi': 'V',
                'flip': 'F',
                'decayed-flip': 'D',
                'tree': 'T'
            }[v]

        if k == 'ngram':
            v = {'unigram': 1, 'bigram': 2}[v]

        if k == 'forget_method':
            v = {'uniformly': 'U', 'proportional': 'P'}[v]

        dpseg_args[k.replace('_', '-')] = v

    dpseg_args = ' '.join('--{} {}'.format(k, v)
                          for k, v in dpseg_args.items())

    # adapt boolean values
    for orig, new in (('--eval-maximize True',
                       '--eval-maximize 1'), ('--eval-maximize False', ''),
                      ('--do-mbdp True', '--do-mbdp 1'), ('--do-mbdp False',
                                                          '')):
        dpseg_args = dpseg_args.replace(orig, new)

    segmented = segment(streamin,
                        nfolds=args.nfolds,
                        njobs=args.njobs,
                        args=dpseg_args,
                        log=log)
    streamout.write('\n'.join(segmented) + '\n')
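
An illustration of the argument-string construction above on a tiny made-up namespace: Python-style option names become '--long-option value' pairs passed to the dpseg binary as a single string, and boolean values are adapted afterwards.

# Pure-Python illustration of the argument string built above (options made up)
from types import SimpleNamespace

args = SimpleNamespace(ngram='unigram', estimator='flip', do_mbdp=False)

dpseg_args = {}
for k, v in vars(args).items():
    if k == 'estimator':
        v = {'viterbi': 'V', 'flip': 'F', 'decayed-flip': 'D', 'tree': 'T'}[v]
    if k == 'ngram':
        v = {'unigram': 1, 'bigram': 2}[v]
    dpseg_args[k.replace('_', '-')] = v

arg_string = ' '.join('--{} {}'.format(k, v) for k, v in dpseg_args.items())
arg_string = arg_string.replace('--do-mbdp False', '').strip()
print(arg_string)  # --ngram 1 --estimator F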
Example #9
def main():
    """Entry point of the 'wordseg-tp' command"""
    # command initialization

    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-tp',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # if the deprecated --probability option is used, emit a warning
    # and convert it to the new --dependency option.
    if args.probability is not None:
        log.warning('''-p/--probability option is deprecated (maintained for
            backward compatibility), please use -d/--dependency instead.''')

        if args.probability == 'forward':
            args.dependency = 'ftp'
        else:  # 'backward'
            args.dependency = 'btp'

    # load the train text if any
    train_text = None
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise RuntimeError('train file not found: {}'.format(
                args.train_file))
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')

    # load train and test texts, ignore empty lines
    test_text = (line for line in streamin if line)
    if train_text:
        train_text = (line for line in train_text if line)

    # segment the input text with the train text
    text = segment(test_text,
                   train_text=train_text,
                   threshold=args.threshold,
                   dependency=args.dependency,
                   log=log)

    # output the result
    streamout.write('\n'.join(text) + '\n')
Example #10
def main():
    """Entry point of the 'wordseg-syll' command"""
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-syll',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=_add_arguments)

    # loads the onsets
    if not os.path.isfile(args.onsets_file):
        raise RuntimeError('unknown onsets file "{}"'.format(args.onsets_file))
    onsets = Syllabifier.open_datafile(args.onsets_file)

    # loads the vowels
    if not os.path.isfile(args.vowels_file):
        raise RuntimeError('unknown vowels file "{}"'.format(args.vowels_file))
    vowels = Syllabifier.open_datafile(args.vowels_file)

    log.info('loaded %s onsets', len(onsets))
    log.debug('onsets are "%s"', ', '.join(onsets))
    log.info('loaded %s vowels', len(vowels))
    log.debug('vowels are "%s"', ', '.join(vowels))
    log.debug('separator is %s', separator)

    syllabifier = Syllabifier(onsets,
                              vowels,
                              separator=separator,
                              filling_vowel=args.filling_vowel,
                              log=log)

    # syllabify the input text
    sylls = utils.CountingIterator(
        syllabifier.syllabify(streamin,
                              strip=args.strip,
                              tolerant=args.tolerant))

    # display the output
    log.info('syllabified %s utterances', sylls.count)
    streamout.write('\n'.join(sylls) + '\n')
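
A hedged sketch of the syllabifier outside the CLI. The module path wordseg.syllabification, the onset and vowel inventories and the utterance are all assumptions used for illustration only.

# Hypothetical direct use of the syllabifier (module path and data assumed)
from wordseg.separator import Separator
from wordseg.syllabification import Syllabifier

onsets = ['hh', 'l', 'w']    # made-up onset inventory
vowels = ['ax', 'ow', 'er']  # made-up vowel inventory
separator = Separator(phone=' ', syllable=';esyll', word=';eword')

syllabifier = Syllabifier(onsets, vowels, separator=separator)
text = ['hh ax l ow ;eword w er l d ;eword']  # phonemized, word-tagged utterance
print('\n'.join(syllabifier.syllabify(text)))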
Example #11
def main():
    """Entry point of the 'wordseg-ag' command"""
    # initializing standard i/o and arguments
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-ag',
        description="""Adaptor Grammar word segmentation algorithm""",
        add_arguments=_add_arguments,
        train_file=True)

    # build the AG command line (C++ side) from the parsed arguments
    # (Python side)
    cmd_args = _command_line_arguments(args)

    # loads the train text if any
    train_text = None
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise ValueError(
                'train file does not exist: {}'.format(args.train_file))
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')

    # call the AG algorithm
    segmented = segment(
        streamin,
        train_text=train_text,
        grammar_file=args.grammar,
        category=args.category,
        args=cmd_args,
        save_grammar_to=args.save_grammar_to,
        ignore_first_parses=args.ignore_first_parses,
        nruns=args.nruns,
        njobs=args.njobs,
        tempdir=args.tempdir,
        log=log)

    # output the results
    streamout.write('\n'.join(segmented) + '\n')
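
A hedged sketch of a direct call to the Adaptor Grammar wrapper. The module path wordseg.algos.ag, the grammar file name and the category are placeholders; in practice AG runs are long and are usually driven through the CLI shown above.

# Hypothetical direct call to the AG segmenter (module path and file names assumed)
from wordseg.algos.ag import segment

prepared = ['hh ax l ow w er l d', 'g uh d m ao r n ih ng']

segmented = segment(
    prepared,
    grammar_file='grammar.lt',  # placeholder path to a grammar file
    category='Word',            # must name a non-terminal of that grammar
    nruns=2,                    # number of parallel AG runs, value assumed
    njobs=2)
print('\n'.join(segmented))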
Example #12
def main():
    """Entry point of the 'wordseg-dibs' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dibs',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # setup the separator from parsed arguments
    separator = Separator(phone=args.phone_separator,
                          syllable=args.syllable_separator,
                          word=args.word_separator)

    # load test text as a list of utterances, ignore empty lines
    test_text = [line for line in streamin if line]
    log.info('loaded %s utterances as test data', len(test_text))

    # user provided a train text, ensure it is valid and that test_text does
    # not include word separators
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise ValueError(
                'train file specified but does not exist: {}'.format(
                    args.train_file))

        # make sure test_text is in prepared form
        for n, line in enumerate(test_text):
            if separator.word in line:
                raise ValueError(
                    f'word separator found in test text (line {n+1})')

        # load train and test texts, ignore empty lines
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')
        train_text = [line for line in train_text if line]
        log.info('loaded %s utterances as train data', len(train_text))
    else:
        log.info('using test data for training')
        # the presence of word separator in train utterance will be checked
        # during training
        train_text = test_text

        # remove the word separators for testing
        test_text = prepare(test_text)

    # train the model (learn diphone statistics)
    trained_model = CorpusSummary(train_text,
                                  separator=separator,
                                  level=args.unit,
                                  log=log)

    # segment the test text on the trained model
    segmented = segment(test_text,
                        trained_model,
                        type=args.type,
                        threshold=args.threshold,
                        pwb=args.pboundary,
                        log=log)

    # output the segmented text
    streamout.write('\n'.join(segmented) + '\n')

    # save the computed diphones if required
    if args.diphones:
        log.info('saving %s diphones to %s', len(trained_model.diphones),
                 args.diphones)

        output = ('{} {} {}'.format(v, k[0], k[1])
                  for k, v in sorted(trained_model.diphones.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True))

        codecs.open(args.diphones, 'w',
                    encoding='utf8').write('\n'.join(output) + '\n')
Example #13
def main():
    """Entry point of the 'wordseg-prep' command"""

    # add a command-specific argument
    def add_arguments(parser):
        parser.add_argument('-u',
                            '--unit',
                            type=str,
                            choices=['phone', 'syllable'],
                            default='phone',
                            help='''
            output level representation, must be "phone" or "syllable"''')

        parser.add_argument(
            '-t',
            '--tolerant',
            action='store_true',
            help='''tolerate the badly formatted utterances in input,
            but ignore them in output (default is to exit on the first
            encountered error)''')

        parser.add_argument(
            '-P',
            '--punctuation',
            action='store_true',
            help='punctuation characters are not considered illegal')

        group = [
            g for g in parser._action_groups
            if g.title == 'input/output arguments'
        ][0]
        group.add_argument(
            '-g',
            '--gold',
            type=str,
            metavar='<gold-file>',
            help='''generates the gold text to the specified file,
            do not generate gold if no file specified''')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-prep',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=add_arguments)

    streamin = list(streamin)

    log.debug('separator is %s', separator)
    log.info('preparing the text at {} level'.format(args.unit))

    # check all the utterances are correctly formatted.
    prep = utils.CountingIterator(
        prepare(streamin,
                separator,
                unit=args.unit,
                log=log,
                check_punctuation=not args.punctuation,
                tolerant=args.tolerant))

    # write the prepared text, one utterance per line, ending with a newline
    streamout.write('\n'.join(prep) + '\n')
    log.info('prepared %s utterances', prep.count)

    if args.gold:
        log.info('generating gold text to %s', args.gold)
        gold_text = gold(streamin, separator=separator)
        open(args.gold, 'w').write('\n'.join(gold_text) + '\n')
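
A hedged sketch of the prepare/gold pair used above, outside the CLI. The module path wordseg.prepare is an assumption and the tagged utterance is made up.

# Hypothetical direct use of prepare() and gold() (module path assumed)
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator

separator = Separator(phone=' ', syllable=';esyll', word=';eword')
tagged = ['hh ax ;esyll l ow ;esyll ;eword w er l d ;esyll ;eword']

print('\n'.join(prepare(tagged, separator, unit='phone')))  # unsegmented input text
print('\n'.join(gold(tagged, separator=separator)))         # word-segmented gold text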