def main():
    """Entry point of the 'wordseg-puddle' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-puddle',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # post-process arguments: with an explicit train file there is a single
    # train/test pass, so folding and parallel options are meaningless
    if args.train_file and (args.njobs or args.nfolds):
        raise ValueError(
            '--train-file option is incompatible with --njobs and --nfolds')
    args.njobs = args.njobs or 1
    args.nfolds = args.nfolds or 5

    # load the train text if any
    train_text = None
    if args.train_file:
        if not os.path.isfile(args.train_file):
            # bug fix: the message wrongly said 'test file' for a missing
            # train file
            raise RuntimeError(f'train file not found: {args.train_file}')
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')

    # load train and test texts, ignore empty lines
    test_text = (line for line in streamin if line)
    if train_text:
        train_text = (line for line in train_text if line)

    # segment the test text, optionally using the train text
    segmented = segment(
        test_text,
        train_text=train_text,
        window=args.window,
        by_frequency=args.by_frequency,
        nfolds=args.nfolds,
        njobs=args.njobs,
        log=log)

    streamout.write('\n'.join(segmented) + '\n')
def main():
    """Entry point of the 'wordseg-baseline' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-baseline',
        description=__doc__,
        add_arguments=_add_arguments)

    # setup the seed for random number generation
    if args.random:
        log.info('setup random seed to %s', args.random)
        random.seed(args.random)

    if args.oracle:
        # load the oracle text
        if not os.path.isfile(args.oracle):
            raise ValueError('oracle file not found: {}'.format(args.oracle))

        # bug fix: read the oracle text as utf8 (codecs.open without an
        # encoding falls back to the locale default), consistent with the
        # other wordseg commands
        oracle_text = list(codecs.open(args.oracle, 'r', encoding='utf8'))
        log.info('loaded %s utterances from oracle text', len(oracle_text))

        # init the oracle tokens separator
        oracle_separator = Separator(
            phone=args.phone_separator,
            syllable=args.syllable_separator,
            word=args.word_separator)

        segmented = segment_oracle(
            streamin, oracle_text, oracle_separator, args.level, log=log)
    else:
        # no oracle: random segmentation at the given boundary probability
        segmented = segment(streamin, probability=args.probability, log=log)

    streamout.write('\n'.join(segmented) + '\n')
def main():
    """Entry point of the 'wordseg-stats' command"""
    # command-specific options
    def add_arguments(parser):
        parser.add_argument(
            '--json', action='store_true',
            help='print the results in JSON format, else print in raw text')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-stats',
        description=__doc__,
        add_arguments=add_arguments,
        separator=Separator())

    # compute all the statistics on the input corpus
    results = CorpusStatistics(streamin, separator, log=log).describe_all()

    # JSON output requested: dump and stop there
    if args.json:
        streamout.write(json.dumps(results, indent=4) + '\n')
        return

    # raw text output: one "<name> <key> <value>" triplet per line
    lines = []
    for name, scores in results.items():
        for key, value in scores.items():
            lines.append(' '.join((name, key, str(value))))
    streamout.write('\n'.join(lines) + '\n')
def main():
    """Entry point of the 'wordseg-tp' command"""
    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-tp',
        description=__doc__,
        add_arguments=_add_arguments)

    # the deprecated --probability option is still honored: warn and
    # translate it to the equivalent --dependency value
    if args.probability is not None:
        log.warning('''-p/--probability option is deprecated (maintained for backward compatibility), please use -d/--dependency instead.''')
        args.dependency = (
            'ftp' if args.probability == 'forward' else 'btp')

    # segment the input text
    segmented = segment(
        streamin,
        threshold=args.threshold,
        dependency=args.dependency,
        log=log)

    # output the result
    streamout.write('\n'.join(segmented) + '\n')
def main():
    """Entry point of the 'wordseg-eval' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-eval',
        description=__doc__,
        add_arguments=_add_arguments)

    log.info('loads input and gold texts')

    # load the gold text as a list of utterances, remove empty lines;
    # fix: close the file once read (it was opened and leaked) -- assumes
    # _load_text consumes the stream eagerly, as its list result suggests
    with codecs.open(args.gold, 'r', encoding='utf8') as fgold:
        gold = _load_text(fgold)

    # load the text as a list of utterances, remove empty lines
    text = _load_text(streamin)

    # load the prepared (unsegmented) text as a list of utterances,
    # remove empty lines
    units = None
    if args.rand_index:
        with codecs.open(args.rand_index, 'r', encoding='utf8') as funits:
            units = _load_text(funits)

    # evaluation returns a dict of 'score name' -> float
    log.info('evaluates the segmentation')
    results = evaluate(text, gold, units=units)

    # display scores with 4-digit float precision
    streamout.write('\n'.join(
        '{}\t{}'.format(k, '%.4g' % v if v is not None else 'None')
        for k, v in results.items()) + '\n')

    if args.summary:
        log.info('computes errors summary, writes to %s', args.summary)
        with codecs.open(args.summary, 'w', encoding='utf8') as fsummary:
            fsummary.write(json.dumps(summary(text, gold), indent=4))
def main():
    """Entry point of the 'wordseg-puddle' command"""
    # command initialization
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-puddle',
        description=__doc__,
        add_arguments=_add_arguments)

    # segment the input text with the puddle algorithm
    result = segment(
        streamin,
        window=args.window,
        nfolds=args.nfolds,
        njobs=args.njobs,
        log=log)

    # write the segmented utterances, one per line
    output = '\n'.join(result)
    streamout.write(output + '\n')
def main():
    """Entry point of the 'wordseg-dibs' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dibs',
        description=__doc__,
        add_arguments=_add_arguments)

    # setup the separator from parsed arguments
    separator = Separator(
        phone=args.phone_separator,
        syllable=args.syllable_separator,
        word=args.word_separator)

    # ensure the train file exists
    if not os.path.isfile(args.train_file):
        raise ValueError('train file does not exist: {}'.format(
            args.train_file))

    # load train and test texts, ignore empty lines (the train file is
    # kept open: the generator is consumed lazily by CorpusSummary)
    train_text = codecs.open(args.train_file, 'r', encoding='utf8')
    train_text = (line for line in train_text if line)
    test_text = (line for line in streamin if line)

    # train the model (learn diphone statistics)
    dibs_summary = CorpusSummary(
        train_text, separator=separator, level=args.unit, log=log)

    # segment the test text on the trained model
    output = segment(
        test_text, dibs_summary,
        type=args.type,
        threshold=args.threshold,
        pwb=args.pboundary,
        log=log)

    # output the segmented text
    streamout.write('\n'.join(output) + '\n')

    # save the computed diphones if required, most frequent first
    if args.diphones:
        log.info(
            'saving %s diphones to %s',
            len(dibs_summary.diphones), args.diphones)

        output = ('{} {} {}'.format(v, k[0], k[1]) for k, v in sorted(
            dibs_summary.diphones.items(),
            key=operator.itemgetter(1),
            reverse=True))

        # fix: use a context manager so the file is closed and flushed
        # (it was opened, written and leaked)
        with codecs.open(args.diphones, 'w', encoding='utf8') as fout:
            fout.write('\n'.join(output) + '\n')
def main():
    """Entry point of the 'wordseg-dpseg' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dpseg',
        description=__doc__,
        add_arguments=_add_arguments)

    assert args.nfolds > 0

    # options handled on the Python side only, never forwarded to dpseg
    excluded_args = ['verbose', 'quiet', 'input', 'output', 'nfolds', 'njobs']

    # some option values must be translated to the short codes expected
    # by the dpseg binary
    value_codes = {
        'estimator': {
            'viterbi': 'V', 'flip': 'F', 'decayed-flip': 'D', 'tree': 'T'},
        'ngram': {'unigram': 1, 'bigram': 2},
        'forget_method': {'uniformly': 'U', 'proportional': 'P'}}

    # build the dpseg command arguments
    forwarded = {}
    for name, value in vars(args).items():
        # ignored arguments
        if name in excluded_args or (value in (None, False) and value != 0):
            continue
        if name in value_codes:
            value = value_codes[name][value]
        forwarded[name.replace('_', '-')] = value

    dpseg_args = ' '.join(
        '--{} {}'.format(name, value) for name, value in forwarded.items())

    # adapt boolean values to the 0/1 flags the binary understands
    replacements = (
        ('--eval-maximize True', '--eval-maximize 1'),
        ('--eval-maximize False', ''),
        ('--do-mbdp True', '--do-mbdp 1'),
        ('--do-mbdp False', ''))
    for before, after in replacements:
        dpseg_args = dpseg_args.replace(before, after)

    segmented = segment(
        streamin,
        nfolds=args.nfolds,
        njobs=args.njobs,
        args=dpseg_args,
        log=log)

    streamout.write('\n'.join(segmented) + '\n')
def main():
    """Entry point of the 'wordseg-tp' command"""
    # command initialization
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-tp',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # if the deprecated --probability option is used, raise a warning
    # and convert it to the new --dependency option.
    if args.probability is not None:
        log.warning('''-p/--probability option is deprecated (maintained for backward compatibility), please use -d/--dependency instead.''')
        if args.probability == 'forward':
            args.dependency = 'ftp'
        else:  # 'backward'
            args.dependency = 'btp'

    # load the train text if any
    train_text = None
    if args.train_file:
        if not os.path.isfile(args.train_file):
            # bug fix: the message wrongly said 'test file' for a missing
            # train file
            raise RuntimeError('train file not found: {}'.format(
                args.train_file))
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')

    # load train and test texts, ignore empty lines
    test_text = (line for line in streamin if line)
    if train_text:
        train_text = (line for line in train_text if line)

    # segment the input text with the train text
    text = segment(
        test_text,
        train_text=train_text,
        threshold=args.threshold,
        dependency=args.dependency,
        log=log)

    # output the result
    streamout.write('\n'.join(text) + '\n')
def main():
    """Entry point of the 'wordseg-syll' command"""
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-syll',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=_add_arguments)

    # load the onsets, abort if the file is missing
    if not os.path.isfile(args.onsets_file):
        raise RuntimeError('unknown onsets file "{}"'.format(args.onsets_file))
    onsets = Syllabifier.open_datafile(args.onsets_file)

    # load the vowels, abort if the file is missing
    if not os.path.isfile(args.vowels_file):
        raise RuntimeError('unknown vowels file "{}"'.format(args.vowels_file))
    vowels = Syllabifier.open_datafile(args.vowels_file)

    # trace what has been loaded
    log.info('loaded %s onsets', len(onsets))
    log.debug('onsets are "%s"', ', '.join(onsets))
    log.info('loaded %s vowels', len(vowels))
    log.debug('vowels are "%s"', ', '.join(vowels))
    log.debug('separator is %s', separator)

    # build the syllabifier from the loaded data
    syllabifier = Syllabifier(
        onsets, vowels,
        separator=separator,
        filling_vowel=args.filling_vowel,
        log=log)

    # syllabify the input text, counting the processed utterances
    syllabified = utils.CountingIterator(syllabifier.syllabify(
        streamin, strip=args.strip, tolerant=args.tolerant))

    # display the output
    log.info('syllabified %s utterances', syllabified.count)
    streamout.write('\n'.join(syllabified) + '\n')
def main():
    """Entry point of the 'wordseg-ag' command"""
    # initializing standard i/o and arguments
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-ag',
        description="""Adaptor Grammar word segmentation algorithm""",
        add_arguments=_add_arguments,
        train_file=True)

    # translate the parsed Python options into the AG command line
    # arguments (C++ side)
    ag_cmdline = _command_line_arguments(args)

    # open the train text when one was supplied, after checking it exists
    train_text = None
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise ValueError(
                'train file does not exist: {}'.format(args.train_file))
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')

    # call the AG algorithm
    segmented = segment(
        streamin,
        train_text=train_text,
        grammar_file=args.grammar,
        category=args.category,
        args=ag_cmdline,
        save_grammar_to=args.save_grammar_to,
        ignore_first_parses=args.ignore_first_parses,
        nruns=args.nruns,
        njobs=args.njobs,
        tempdir=args.tempdir,
        log=log)

    # output the results, one utterance per line
    result = '\n'.join(segmented)
    streamout.write(result + '\n')
def main():
    """Entry point of the 'wordseg-dibs' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dibs',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # setup the separator from parsed arguments
    separator = Separator(
        phone=args.phone_separator,
        syllable=args.syllable_separator,
        word=args.word_separator)

    # load test text as a list of utterances, ignore empty lines
    test_text = [line for line in streamin if line]
    log.info('loaded %s utterances as test data', len(test_text))

    # user provided a train text, ensure it is valid and that test_text does
    # not include word separators
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise ValueError(
                'train file specified but does not exist: {}'.format(
                    args.train_file))

        # make sure test_text is in prepared form
        for n, line in enumerate(test_text):
            if separator.word in line:
                raise ValueError(
                    f'word separator found in test text (line {n+1})')

        # load the train text, ignore empty lines; fix: close the file
        # once read (it was opened and leaked)
        with codecs.open(args.train_file, 'r', encoding='utf8') as fin:
            train_text = [line for line in fin if line]
        log.info('loaded %s utterances as train data', len(train_text))
    else:
        log.info('using test data for training')
        # the presence of word separator in train utterance will be checked
        # during training
        train_text = test_text
        # remove the word separators for testing
        test_text = prepare(test_text)

    # train the model (learn diphone statistics)
    trained_model = CorpusSummary(
        train_text, separator=separator, level=args.unit, log=log)

    # segment the test text on the trained model
    segmented = segment(
        test_text, trained_model,
        type=args.type,
        threshold=args.threshold,
        pwb=args.pboundary,
        log=log)

    # output the segmented text
    streamout.write('\n'.join(segmented) + '\n')

    # save the computed diphones if required, most frequent first
    if args.diphones:
        log.info(
            'saving %s diphones to %s',
            len(trained_model.diphones), args.diphones)

        output = ('{} {} {}'.format(v, k[0], k[1]) for k, v in sorted(
            trained_model.diphones.items(),
            key=operator.itemgetter(1),
            reverse=True))

        # fix: use a context manager so the file is closed and flushed
        # (it was opened, written and leaked)
        with codecs.open(args.diphones, 'w', encoding='utf8') as fout:
            fout.write('\n'.join(output) + '\n')
def main():
    """Entry point of the 'wordseg-prep' command"""
    # add command-specific arguments
    def add_arguments(parser):
        parser.add_argument(
            '-u', '--unit', type=str,
            choices=['phone', 'syllable'], default='phone', help='''
            output level representation, must be "phone" or "syllable"''')

        parser.add_argument(
            '-t', '--tolerant', action='store_true',
            help='''tolerate the badly formated utterances in input,
            but ignore them in output (default is to exit on the first
            encountered error)''')

        parser.add_argument(
            '-P', '--punctuation', action='store_true',
            help='punctuation characters are not considered illegal')

        # attach --gold to the existing input/output group of the parser
        group = [
            g for g in parser._action_groups
            if g.title == 'input/output arguments'][0]
        group.add_argument(
            '-g', '--gold', type=str, metavar='<gold-file>',
            help='''generates the gold text to the specified file,
            do not generate gold if no file specified''')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-prep',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=add_arguments)

    # materialize the input: it is iterated twice (prepare and gold)
    streamin = list(streamin)

    log.debug('separator is %s', separator)
    # idiom fix: lazy %-style logging args instead of eager str.format
    log.info('preparing the text at %s level', args.unit)

    # check all the utterances are correctly formatted.
    prep = utils.CountingIterator(prepare(
        streamin, separator,
        unit=args.unit,
        log=log,
        check_punctuation=not args.punctuation,
        tolerant=args.tolerant))

    # write prepared text, one utterance a line, ending with a newline
    streamout.write('\n'.join(prep) + '\n')
    log.info('prepared %s utterances', prep.count)

    if args.gold:
        log.info('generating gold text to %s', args.gold)
        gold_text = gold(streamin, separator=separator)
        # fix: write the gold file as utf8 and close it deterministically
        # (builtin open used the locale encoding and the file was leaked)
        with codecs.open(args.gold, 'w', encoding='utf8') as fgold:
            fgold.write('\n'.join(gold_text) + '\n')