def test_fold():
    """Check folding.fold output (folds and index) on small inputs."""
    # with a single fold the text is returned unchanged
    expected = ([[1, 2, 3]], [0])
    assert folding.fold([1, 2, 3], 1) == expected

    # three folds over three items: each group holds one element
    expected = ([[1, 2, 3], [3, 1, 2], [2, 3, 1]], [2, 2, 2])
    assert folding.fold([1, 2, 3], 3) == expected

    # four items over three folds: the last group is [3, 4]
    expected = ([[1, 2, 3, 4], [3, 4, 1, 2], [2, 3, 4, 1]], [2, 3, 3])
    assert folding.fold([1, 2, 3, 4], 3) == expected
def segment(text, nfolds=5, njobs=1, args='--ngram 1 --a1 0 --b1 1', log=utils.null_logger(), binary=utils.get_binary('dpseg')): """Run the 'dpseg' binary on `nfolds` folds""" # force the text to be a list of utterances text = list(text) # set of unique units (syllables or phones) present in the text units = set(unit for utt in text for unit in utt.split()) log.info('%s units found in %s utterances', len(units), len(text)) # create a unicode equivalent for each unit and convert the text # to that unicode version log.debug('converting input to unicode') unicode_gen = UnicodeGenerator() unicode_mapping = {unit: unicode_gen() for unit in units} unicode_text = [ ''.join(unicode_mapping[unit] for unit in utt.split()) for utt in text ] log.debug('building %s folds', nfolds) fold_boundaries = _dpseg_bugfix(unicode_text, folding.boundaries(unicode_text, nfolds), log) folded_texts, fold_index = folding.fold(unicode_text, nfolds, fold_boundaries=fold_boundaries) segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)( joblib.delayed(_dpseg)(fold, args, log_level=log.getEffectiveLevel(), log_name='wordseg-dpseg - fold {}'.format(n + 1), binary=binary) for n, fold in enumerate(folded_texts)) log.debug('unfolding the %s folds', nfolds) output_text = folding.unfold(segmented_texts, fold_index) # convert the text back to unit level (from unicode level) log.debug('converting output back from unicode') unit_mapping = {v: k for k, v in unicode_mapping.items()} unit_mapping[' '] = ' ' segmented_text = (''.join(unit_mapping[char] for char in utt) for utt in output_text) return (utt for utt in segmented_text if utt)
def segment(text, window=2, nfolds=5, njobs=1, log=utils.null_logger()): """Returns a word segmented version of `text` using the puddle algorithm Parameters ---------- text : sequence A sequence of lines with syllable (or phoneme) boundaries marked by spaces and no word boundaries. Each line in the sequence corresponds to a single and comlete utterance. window : int, optional Number of phonemes to be taken into account for boundary constraint. nfolds : int, optional The number of folds to segment the `text` on. njobs : int, optional The number of subprocesses to run in parallel. The folds are independant of each others and can be computed in parallel. Requesting a number of jobs greater then `nfolds` have no effect. log : logging.Logger, optional The logger instance where to send messages. Returns ------- generator The utterances from `text` with estimated words boundaries. See also -------- wordseg.folding.fold """ # force the text to be a list of utterances text = list(text) log.debug('building %s folds', nfolds) folded_texts, fold_index = folding.fold(text, nfolds) segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)( joblib.delayed(_puddle)(fold, window, log_level=log.getEffectiveLevel(), log_name='wordseg-puddle - fold {}'.format(n + 1)) for n, fold in enumerate(folded_texts)) log.debug('unfolding the %s folds', nfolds) output_text = folding.unfold(segmented_texts, fold_index) return (utt for utt in output_text if utt)
def test_fold_unfold_nfolds(nfolds, tags):
    """Folding then unfolding must restore the original tags."""
    folded, index = folding.fold(tags, nfolds)
    restored = folding.unfold(folded, index)
    assert restored == tags
def test_unfold_basic(nfolds):
    """unfold is the inverse of fold on a minimal 3-element list."""
    data = [1, 2, 3]
    folded, index = folding.fold(data, nfolds)
    assert folding.unfold(folded, index) == data
def segment(text, train_text=None, window=2, by_frequency=False,
            nfolds=5, njobs=1, log=utils.null_logger()):
    """Returns a word segmented version of `text` using the puddle algorithm

    Parameters
    ----------
    text : sequence of str
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    train_text : sequence of str
        The list of utterances to train the model on. If None (default)
        the model is trained online during segmentation. When
        `train_text` is specified, the options `nfolds` and `njobs` are
        ignored.
    window : int, optional
        Number of phonemes to be taken into account for boundary
        constraint. Default to 2.
    by_frequency : bool, optional
        When True choose the word candidates by filtering them by
        frequency. Default to False.
    nfolds : int, optional
        The number of folds to segment the `text` on. This option is
        ignored if a `train_text` is provided.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each others and can be computed in
        parallel. Requesting a number of jobs greater than `nfolds` has
        no effect. This option is ignored if a `train_text` is
        provided.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated words boundaries.

    See also
    --------
    wordseg.folding.fold

    """
    # force the text to be a list of utterances
    text = list(text)

    if not train_text:
        # online training: fold the test text and train/segment each
        # fold independently
        log.info('no train data provided, will train model on test data')

        log.debug('building %s folds', nfolds)
        folded_texts, fold_index = folding.fold(text, nfolds)

        # segment the folds in parallel
        segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)(
            joblib.delayed(_do_puddle)(
                fold, window, by_frequency,
                log.getEffectiveLevel(),
                f'wordseg-puddle - fold {n+1}')
            for n, fold in enumerate(folded_texts))

        log.debug('unfolding the %s folds', nfolds)
        output_text = folding.unfold(segmented_texts, fold_index)

        # filter out empty utterances
        return (utt for utt in output_text if utt)

    # force the train text from sequence to list
    train_text = list(train_text)
    log.info('train data: %s utterances loaded', len(train_text))

    # init a puddle model and train it
    model = Puddle(window=window, by_frequency=by_frequency, log=log)
    model.train(train_text)

    # segmentation of the test text, keeping the model constant
    return (utt for utt in model.segment(text, update_model=False) if utt)