def __init__(self, onsets, vowels, separator=Separator(),
             filling_vowel=False, log=utils.null_logger()):
    """Initialize the instance from lists of onsets and vowels

    Parameters
    ----------
    onsets : list of str
        The onsets, must be a non-empty list.
    vowels : list of str
        The vowels, must be a non-empty list. The list is copied
        and is never modified in place.
    separator : Separator, optional
        Stored as ``self.separator``.
    filling_vowel : bool, optional
        When True, a "silent" vowel is chosen (the first unicode
        character, starting at code 1, that appears in no onset and
        no vowel). It is stored in ``self.silent`` and added to
        ``self.vowels`` and ``self.symbols``. When False (default)
        ``self.silent`` is None.
    log : logging.Logger, optional
        Where to send log messages.

    Raises
    ------
    ValueError
        If `vowels` or `onsets` is not a list or is empty.

    """
    # ensure onsets and vowels are not empty
    if not isinstance(vowels, list) or not vowels:
        raise ValueError('unvalid or empty vowels list')
    if not isinstance(onsets, list) or not onsets:
        raise ValueError('unvalid or empty onsets list')

    self.onsets = onsets
    # private copy: the silent vowel appended below must not leak into
    # the list owned by the caller
    self.vowels = list(vowels)
    self.separator = separator
    self.log = log

    # set of all the characters used in onsets and vowels (useful to
    # detect any char during syllabification)
    self.symbols = set(''.join(vowels)) | set(''.join(onsets))

    self.silent = None
    if filling_vowel:
        # find a silent vowel (some char not already present in the
        # symbols)
        code = 1
        while six.unichr(code) in self.symbols:
            code += 1

        self.silent = six.unichr(code)
        self.symbols.add(self.silent)
        self.vowels.append(self.silent)
def __init__(self, window=2, by_frequency=False, log=utils.null_logger()):
    """Store the parameters and create the (empty) model counters

    Parameters
    ----------
    window : int, optional
        Stored as ``self.window``, default to 2.
    by_frequency : bool, optional
        Stored as ``self.by_frequency``, default to False.
    log : logging.Logger, optional
        Where to send log messages, stored as ``self._log``.

    """
    # parameters
    self._log = log
    self.window, self.by_frequency = window, by_frequency

    # the model itself: three independent and empty counters
    self._lexicon, self._beginning, self._ending = (
        collections.Counter() for _ in range(3))
def _dpseg_bugfix(text, boundaries, log=utils.null_logger()):
    """Ensure the first line of each fold has more than one symbol

    The C++ implementation of dpseg fails when the first input line
    is made of a single character (i.e. a unicode symbol). To work
    around this, each fold boundary pointing to a one-symbol line is
    moved forward to the next line having at least two symbols.

    Parameters
    ----------
    text : sequence of str
        The utterances, one symbol per character.
    boundaries : list of int
        Index in `text` of the first line of each fold. The list is
        updated **in place** and also returned.
    log : logging.Logger, optional
        Where to send log messages.

    Returns
    -------
    boundaries : list of int
        The (possibly shifted) fold boundaries.

    Raises
    ------
    ValueError
        If the bugfix is needed and cannot be applied: the very
        first line of `text` has a single symbol, no line with two
        symbols or more is found after a boundary, or the shifted
        boundaries are no longer strictly increasing.

    Notes
    -----
    This implementation differs from the one in CDSWordSeg. Here the
    fold index (and thus the fold size) is modified, whereas
    CDSWordSeg permutes lines in the text (and thus in the gold
    file). In wordseg the gold file must not be exposed at this
    level.

    """
    # length of the first line of each fold
    head_lengths = [len(text[start]) for start in boundaries]

    # folds to fix are those starting with a one-symbol line
    broken = [n for n, length in enumerate(head_lengths) if length == 1]
    if not broken:
        log.debug('folds boundaries are OK for dpseg')
        return boundaries

    # the first fold starts at the first line, which cannot be moved
    if head_lengths[0] == 1:
        raise ValueError(
            "The input text's first line has a single symbol, "
            'this will cause wordseg-dpseg to bug. '
            'Please re-arrange your text manually and try again.')

    log.debug('dpseg bugfix: need to fix folds {}'.format(
        [n + 1 for n in broken]))

    for n in broken:
        start = boundaries[n]

        # offset of the first line with len >= 2, from the fold start
        offset = _find_first_line_with_min_len(text[start:], min_len=2)
        if offset is None:
            raise ValueError(
                'dpseg bugfix failed: all lines in the fold {} have len == 1'.
                format(n + 1))

        log.debug('dpseg bugfix: fixing fold {} index from {} to {}'.format(
            n + 1, start, start + offset))
        boundaries[n] += offset

    # the shifted boundaries must remain sorted and without duplicates
    expected = sorted(set(boundaries))
    if not boundaries == expected:
        raise ValueError(
            'dpseg bugfix failed: broke the folds order. '
            'Please re-arrange (shuffle) your text manually and try again.')

    return boundaries
def segment(text, nfolds=5, njobs=1, args='--ngram 1 --a1 0 --b1 1',
            log=utils.null_logger(), binary=utils.get_binary('dpseg')):
    """Run the 'dpseg' binary on `nfolds` folds

    Each unit of the input (space separated) is mapped to a single
    unicode character, the converted text is split in folds, each
    fold is segmented by `_dpseg` (possibly in parallel), and the
    output is unfolded and converted back to the original units.

    Parameters
    ----------
    text : sequence of str
        The utterances to segment, units are space separated.
    nfolds : int, optional
        The number of folds to segment the text on.
    njobs : int, optional
        The number of jobs given to joblib.
    args : str, optional
        Forwarded to `_dpseg`.
    log : logging.Logger, optional
        Where to send log messages.
    binary : str, optional
        Forwarded to `_dpseg`.

    Returns
    -------
    generator
        The non-empty segmented utterances.

    """
    # force the text to be a list of utterances
    utterances = list(text)

    # unique units (syllables or phones) present in the text
    units = {unit for utt in utterances for unit in utt.split()}
    log.info('%s units found in %s utterances', len(units), len(utterances))

    # associate a unicode character to each unit and rewrite the text
    # with those characters
    log.debug('converting input to unicode')
    next_char = UnicodeGenerator()
    unit_to_char = {}
    for unit in units:
        unit_to_char[unit] = next_char()

    encoded = []
    for utt in utterances:
        encoded.append(''.join(unit_to_char[unit] for unit in utt.split()))

    log.debug('building %s folds', nfolds)
    raw_boundaries = folding.boundaries(encoded, nfolds)
    fold_boundaries = _dpseg_bugfix(encoded, raw_boundaries, log)
    folded_texts, fold_index = folding.fold(
        encoded, nfolds, fold_boundaries=fold_boundaries)

    # one job per fold
    jobs = (
        joblib.delayed(_dpseg)(
            fold, args,
            log_level=log.getEffectiveLevel(),
            log_name='wordseg-dpseg - fold {}'.format(n + 1),
            binary=binary)
        for n, fold in enumerate(folded_texts))
    run_in_parallel = joblib.Parallel(n_jobs=njobs, verbose=0)
    segmented_texts = run_in_parallel(jobs)

    log.debug('unfolding the %s folds', nfolds)
    output_text = folding.unfold(segmented_texts, fold_index)

    # convert the text back to unit level (from unicode level), the
    # space (word boundary) is kept as it is
    log.debug('converting output back from unicode')
    char_to_unit = {char: unit for unit, char in unit_to_char.items()}
    char_to_unit[' '] = ' '

    decoded = (
        ''.join(char_to_unit[char] for char in utt) for utt in output_text)
    return (utt for utt in decoded if utt)
def segment_oracle(text, oracle_text, oracle_separator=Separator(),
                   oracle_level='phone', log=utils.null_logger()):
    """Random oracle word segmentation

    The probability of word boundary :math:`p` is estimated from an
    `oracle` text as the ratio ``nwords / (nphones or nsyllables)``,
    according to ``oracle_level``. The segmentation is then delegated
    to ``segment(text, p)``.

    Parameters
    ----------
    text : sequence of str
        The input utterances to segment, tokens are assumed to be
        space separated.
    oracle_text : sequence of str
        The text on which to estimate the probability of word
        boundary. Must be tokenized at word and at least phone or
        syllable levels (according to ``oracle_level``). Any
        iterable is accepted, it is loaded in memory as a list.
    oracle_separator : Separator, optional
        Token separation in the oracle text.
    oracle_level : str, optional
        The level to consider when estimating :math:`p`, must be
        'phone' or 'syllable', default to 'phone'.
    log : logging.Logger
        Where to send log messages

    Returns
    -------
    segmented_text : generator
        The randomly segmented utterances.

    Raises
    ------
    ValueError
        If no token at ``oracle_level`` is found in the oracle text
        (the probability cannot be estimated).

    """
    # the oracle is traversed twice (once per tokenization level):
    # load it so a one-shot iterator is not exhausted by the first pass
    oracle_text = list(oracle_text)

    # estimate the word boundary probability in the oracle text
    ntokens = sum(
        len(list(oracle_separator.tokenize(utt, level=oracle_level)))
        for utt in oracle_text)

    nwords = sum(
        len(list(oracle_separator.tokenize(utt, level='word')))
        for utt in oracle_text)

    log.info('nwords = %s, n%ss = %s', nwords, oracle_level, ntokens)

    # explicit error instead of a bare ZeroDivisionError below
    if ntokens == 0:
        raise ValueError(
            'cannot estimate the word boundary probability: '
            'no {} found in the oracle text'.format(oracle_level))

    if nwords == ntokens:
        log.warning(
            'nwords==nphones. Is the oracle\'s token separation correct?')

    probability = float(nwords) / float(ntokens)
    return segment(text, probability, log=log)
def segment(text, window=2, nfolds=5, njobs=1, log=utils.null_logger()):
    """Returns a word segmented version of `text` using the puddle algorithm

    Parameters
    ----------
    text : sequence
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    window : int, optional
        Number of phonemes to be taken into account for boundary
        constraint.
    nfolds : int, optional
        The number of folds to segment the `text` on.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each other and can be computed in
        parallel. Requesting a number of jobs greater than `nfolds`
        has no effect.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated word boundaries,
        empty utterances being filtered out.

    See also
    --------
    wordseg.folding.fold

    """
    # the folding needs the whole text, load it as a list
    utterances = list(text)

    log.debug('building %s folds', nfolds)
    folded_texts, fold_index = folding.fold(utterances, nfolds)

    # one puddle job per fold
    jobs = (
        joblib.delayed(_puddle)(
            fold, window,
            log_level=log.getEffectiveLevel(),
            log_name='wordseg-puddle - fold {}'.format(n + 1))
        for n, fold in enumerate(folded_texts))
    run_in_parallel = joblib.Parallel(n_jobs=njobs, verbose=0)
    segmented_texts = run_in_parallel(jobs)

    log.debug('unfolding the %s folds', nfolds)
    output_text = folding.unfold(segmented_texts, fold_index)

    return (utt for utt in output_text if utt)
def __init__(self, text, separator=Separator(), level='phone',
             log=utils.null_logger()):
    """Read a train text and accumulate its counts

    Parameters
    ----------
    text : sequence of str
        The train utterances. Empty (or blank) lines are ignored.
    separator : Separator, optional
        Token separation in `text`. The word separator must be
        present in each non-empty utterance.
    level : str, optional
        Must be 'phone' or 'syllable', default to 'phone'. It is
        forwarded to ``self._read_utterance`` for each utterance.
    log : logging.Logger, optional
        Where to send log messages.

    Raises
    ------
    ValueError
        If `level` is not 'phone' or 'syllable'. If the word
        separator is not found in a non-empty utterance.

    """
    if level not in ('phone', 'syllable'):
        raise ValueError(
            'Unknown level {}, must be phone or syllable'.format(level))

    log.info('reading data at %s level', level)

    self.separator = separator

    # NOTE(review): Counter provides an `increment` method (used
    # below), so it is presumably a project class and not
    # collections.Counter -- confirm against the file imports
    self.summary = Counter()
    self.lexicon = Counter()
    self.phrase_initial = Counter()
    self.phrase_final = Counter()
    self.internal_diphones = Counter()
    self.spanning_diphones = Counter()

    nremoved = 0
    for index, utt in enumerate(text):
        # ignore empty lines (as in wordseg-prep, to have a
        # consistent behavior between the tools) and let the user
        # know how many lines we ignored
        if utt.strip() == '':
            log.debug('ignoring empty line %d', index + 1)
            nremoved += 1
            continue

        if separator.word not in utt:
            raise ValueError(
                'word separator "{}" not found in train text: line {}'.
                format(separator.word, index + 1))

        self._read_utterance(utt, level)

    # all the diphones: internal ones plus spanning ones
    self.diphones = Counter(self.internal_diphones)
    for k, v in self.spanning_diphones.items():
        self.diphones.increment(k, v)

    if nremoved > 0:
        log.info('ignored %d empty lines in train text', nremoved)
    log.info('train data summary: %s', self.summary)
def segment(text, probability=0.5, log=utils.null_logger()):
    """Random word segmentation given a boundary probability

    Given a probability :math:`p`, the probability :math:`P(t_i)` to
    add a word boundary after each token :math:`t_i` is:

    .. math::

        P(t_i) = P(X < p), X \\sim \\mathcal{U}(0, 1).

    The `probability` is validated when the function is called (not
    when the result is first iterated), whereas the utterances are
    segmented lazily, one at a time.

    Parameters
    ----------
    text : sequence
        The input utterances to segment, tokens are assumed to be
        space separated.
    probability: float, optional
        The probability to append a word boundary after each token.
    log : logging.Logger
        Where to send log messages

    Returns
    -------
    segmented_text : generator
        The randomly segmented utterances.

    Raises
    ------
    ValueError
        if the probability is not a float in [0, 1].

    """
    # make sure the probability is valid
    if not isinstance(probability, float):
        raise ValueError('probability must be a float')
    if probability < 0 or probability > 1:
        raise ValueError(
            'probability must be in [0, 1], it is {}'.format(probability))

    log.info('P(word boundary) = %s', probability)

    # a generator expression (instead of a `yield` in this function)
    # so that the checks above are executed at call time
    return (
        ''.join(
            token + ' ' if random.random() < probability else token
            for token in utt.strip().split(' '))
        for utt in text)
def __init__(self, corpus, separator, log=utils.null_logger()):
    """Load a corpus and tokenize it at each level of the separator

    Parameters
    ----------
    corpus : sequence of str
        The utterances to load. Each one is stripped and the empty
        ones are ignored.
    separator : Separator
        Token separation in the `corpus`. The word separator must be
        defined, a warning is issued if the phone or the syllable
        separator is not.
    log : logging.Logger, optional
        Where to send log messages.

    Raises
    ------
    ValueError
        If the word separator is not defined, if the corpus has no
        non-empty utterance, or if no token is parsed for one of the
        levels.

    """
    self.log = log
    self.separator = separator

    # words are mandatory, phones and syllables are optional
    if not separator.word:
        raise ValueError('word separator not defined')
    if not separator.phone:
        log.warning('phone separator not defined, some stats ignored')
    if not separator.syllable:
        log.warning('syllable separator not defined, some stats ignored')
    self.log.info('token separator is %s', self.separator)

    # load the corpus as a list of stripped and non-empty utterances
    self.corpus = []
    for raw in corpus:
        stripped = raw.strip()
        if len(stripped):
            self.corpus.append(stripped)

    self.log.info('loaded %s utterances', len(self.corpus))
    if len(self.corpus) == 0:
        raise ValueError('no text to load')

    # tokenize the entire text at each defined level ('word',
    # 'syllable' and/or 'phone'). TODO can be optimized: the entire
    # text is tokenized up to 3 times (implement nested tokenization).
    self.tokens = {}
    for level in self.separator.levels()[::-1]:
        self.log.debug('tokenizing %s', level)

        tokenized = []
        for utt in self.corpus:
            tokenized.append(
                self.separator.tokenize(utt, level, keep_boundaries=False))
        self.tokens[level] = tokenized

        ntokens = sum(len(tokens) for tokens in tokenized)
        self.log.info('parsed %s %ss', ntokens, level)
        if ntokens == 0:
            raise ValueError('{}s expected but 0 parsed'.format(level))

    # estimate the token frequencies, once all levels are tokenized
    self.unigram = {}
    for level in self.separator.levels()[::-1]:
        self.unigram[level] = self._unigram(level)
def __init__(self, summary, pwb=None, threshold=0.5,
             log=utils.null_logger()):
    """Store and check the parameters, then call ``init_diphones``

    Parameters
    ----------
    summary : object
        Stored as ``self.summary``. Its ``separator.word`` attribute
        is stored as ``self.wordsep``.
    pwb : float, optional
        Stored as ``self.pwb``. When defined (and not zero) it must
        be in [0, 1].
    threshold : float, optional
        Stored as ``self.thresh``, must be in [0, 1]. Default to 0.5.
    log : logging.Logger, optional
        Where to send log messages.

    Raises
    ------
    ValueError
        If `pwb` or `threshold` is out of [0, 1].

    """
    self.summary = summary
    self.wordsep = summary.separator.word
    self.log = log
    self.diphones = Counter()

    # a None (or zero) pwb is accepted without range check
    self.pwb = pwb
    if pwb:
        if pwb < 0 or pwb > 1:
            raise ValueError('pwb must be a float in [0, 1], it is: {}'.format(
                str(pwb)))

    self.thresh = threshold
    if threshold < 0 or threshold > 1:
        raise ValueError(
            'threshold must be a float in [0, 1], it is: {}'.format(
                threshold))

    # NOTE(review): init_diphones is not defined in this block, it
    # presumably fills self.diphones -- confirm in the class
    self.init_diphones()
def segment(test_text, trained_model, type='phrasal', threshold=0.5,
            pwb=None, log=utils.null_logger()):
    """Segment a corpus from a trained DiBS model

    This method is a simple wrapper on the Segmenter classes, namely
    GoldSegmenter, PhrasalSegmenter and LexicalSegmenter.

    The segmenter is selected and built when the function is called,
    so invalid parameters are reported immediately. The utterances
    are then segmented lazily, one at a time.

    Parameters
    ----------
    test_text : sequence of str
        The input text to segment is a sequence (list or generator)
        of utterances. Each utterance is composed of space separated
        tokens (can be phones or syllables).
    trained_model : CorpusSummary
        The trained DiBS model used for segmentation of `test_text`.
    type : str, optional
        The type of DiBS segmenter to use, must be 'gold', 'phrasal'
        or 'lexical'. Default is 'phrasal'.
    threshold: float, optional
        Threshold on word boundary probabilities. If a diphone has a
        word boundary probability greater than this threshold, a
        word boundary is added. Must be in [0, 1]. The optimal
        threshold is 0.5 (default).
    pwb : float, optional
        Probability of word boundary, if not specified it is
        estimated from the train text as (nwords - nlines)/(nphones
        - nlines). This option is not used in 'gold' segmentation
        type. When defined must be in [0, 1].
    log : logging.Logger, optional
        The log instance where to send messages.

    Returns
    -------
    generator of str
        The utterances segmented (with estimated word boundaries)

    Raises
    ------
    ValueError
        If `type` is not 'gold', 'phrasal' or 'lexical'. If
        `threshold` and `pwb` are not floats in [0, 1].

    """
    segmenters = {
        'phrasal': PhrasalSegmenter,
        'lexical': LexicalSegmenter,
        'gold': GoldSegmenter}

    # retrieve the segmenter class from the 'type' argument
    if type not in segmenters:
        raise ValueError(
            'unknown segmenter {}, must be phrasal, lexical or gold'.format(
                type))

    # init the segmenter with the trained model
    segmenter = segmenters[type](
        trained_model, pwb=pwb, threshold=threshold, log=log)

    # a generator expression (instead of a `yield` in this function)
    # so that the validation above is executed at call time
    return (segmenter.segment(utt) for utt in test_text)
def __init__(self, window=2, log=utils.null_logger()):
    """Store the parameters and create the (empty) model counters

    Parameters
    ----------
    window : int, optional
        Stored as ``self.window``, default to 2.
    log : logging.Logger, optional
        Where to send log messages, stored as ``self.log``.

    """
    # parameters
    self.window = window
    self.log = log

    # the model itself: three independent and empty counters
    self.lexicon, self.beginning, self.ending = (
        collections.Counter() for _ in range(3))
def segment(text, threshold='relative', dependency='ftp',
            log=utils.null_logger()):
    """Returns a word segmented version of `text` using the TP algorithm

    Parameters
    ----------
    text : sequence
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    threshold : str, optional
        Type of threshold to use, must be 'relative' or 'absolute'.
    dependency : str, optional
        Type of dependency measure to compute, must be 'ftp' for
        forward transitional probability, 'btp' for backward
        transitional probability or 'mi' for mutual information.
    log : logging.Logger, optional
        The logging instance where to send messages.

    Returns
    -------
    list
        The utterances from `text` with estimated word boundaries.

    Raises
    ------
    ValueError
        If `threshold` is not 'relative' or 'absolute'.
        If `dependency` is not 'ftp', 'btp' or 'mi'.

    """
    # raise on invalid threshold type
    if threshold not in ('relative', 'absolute'):
        raise ValueError(
            "invalid threshold, must be 'relative' or 'absolute', it is '{}'".
            format(threshold))

    # raise on invalid dependency measure
    if dependency not in ('ftp', 'btp', 'mi'):
        raise ValueError("invalid dependency measure, must be 'ftp', 'btp' "
                         "or 'mi', it is {}".format(dependency))

    log.info('running TP with %s threshold and %s dependency measure',
             threshold, dependency)

    # all the units of the text in a single list, the utterances being
    # delimited by the 'UB' unit
    units = ' UB '.join(line.strip() for line in text).split()

    # count all the unigrams and bigrams (two successive units)
    unigrams = collections.Counter(units)
    bigrams = collections.Counter(zip(units, units[1:]))

    # select the dependency measure of a bigram given its frequency
    if dependency == 'ftp':
        def measure(pair, freq):
            return float(freq) / unigrams[pair[0]]
    elif dependency == 'btp':
        def measure(pair, freq):
            return float(freq) / unigrams[pair[1]]
    else:  # dependency == 'mi'
        def measure(pair, freq):
            return math.log(
                float(freq) / (unigrams[pair[0]] * unigrams[pair[1]]), 2)

    # transitional probabilities of all the bigrams
    tps = {}
    for pair, freq in bigrams.items():
        tps[pair] = measure(pair, freq)

    # segment the input given the transition probabilities
    if threshold == 'relative':
        cwords = _threshold_relative(units, tps)
    else:
        cwords = _threshold_absolute(units, tps)

    # format the segmented text for output (split on 'UB', remove
    # multiple spaces)
    segtext = ' '.join(''.join(c) for c in cwords)
    cleaned = re.sub(' +', ' ', segtext)
    return [utt.strip() for utt in cleaned.split('UB')]
def prepare(text, separator=Separator(), unit='phone',
            check_punctuation=True, tolerant=False,
            log=utils.null_logger()):
    """Prepares a text in phonological form for word segmentation

    The returned text is ready to be segmented. It consists of a
    sequence of phonological symbols (phones or syllables, depending
    on `unit`) separated by spaces.

    The function removes the word separators from all the lines in
    `text`. If `unit` is 'phone' the syllable separators are removed
    as well. If `unit` is 'syllable' the spaces are removed and the
    syllable separators are replaced by a space.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`
    unit : str, optional
        The unit representation level to prepare the `text` at, must
        be 'syllable' or 'phone'.
    check_punctuation : bool, optional
        Forwarded to `check_utterance`. When True (default), forbid
        any punctuation character in the utterance and raise
        ValueError if any punctuation is found. When False, do not
        check punctuation.
    tolerant : bool, optional
        If False, raise ValueError on the first format error
        detected in the `text`. If True, the badly formatted
        utterances are filtered out from the output and a warning is
        issued.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Yields
    ------
    prepared_text : str
        Utterances from the `text` with separators removed, prepared
        for segmentation at a syllable or phoneme representation
        level (separated by space).

    Raises
    ------
    ValueError
        If `unit` is not 'phone' or 'syllable'. On the first format
        error encountered in `text` (see the `check_utterance`
        function), only if `tolerant` is not True.

    """
    # raise an error if unit is not valid
    if unit not in ('phone', 'syllable'):
        raise ValueError(
            "unit must be 'phone' or 'syllable', it is '{}'".format(unit))

    # the two ways of removing the separators from an utterance
    def _to_phones(utt):
        without_syllables = utt.replace(separator.syllable, '')
        return without_syllables.replace(separator.word, '')

    def _to_syllables(utt):
        without_words = utt.replace(separator.word, '')
        without_spaces = without_words.replace(' ', '')
        return without_spaces.replace(separator.syllable, ' ')

    convert = _to_phones if unit == 'phone' else _to_syllables

    nremoved = 0
    for lineno, raw in enumerate(text, start=1):
        try:
            # force the utf8 encoding
            utt = raw.encode('utf8').decode().strip()
        except ValueError:
            # the conversion failed, only strip the line
            utt = raw.strip()

        # ignore empty lines
        if utt == '':
            log.debug('ignoring empty line %d', lineno)
            nremoved += 1
            continue

        try:
            check_utterance(
                utt, separator, check_punctuation=check_punctuation)
            yield utils.strip(convert(utt))
        except ValueError as err:
            if tolerant is not True:
                raise ValueError('line {}: {}'.format(lineno, err))

            log.info('removing line %d: "%s"', lineno, utt)
            nremoved += 1

    # the count includes the empty lines ignored above
    if nremoved > 0:
        log.warning('removed %d badly formatted utterances', nremoved)
def segment(text, train_text=None, threshold='relative', dependency='ftp',
            log=utils.null_logger()):
    """Returns a word segmented version of `text` using the TP algorithm

    The parameters `text` and `train_text` must be formatted as
    follows: a sequence of lines with syllable (or phoneme)
    boundaries marked by spaces and no word boundaries. Each line in
    the sequence corresponds to a single and complete utterance.

    Parameters
    ----------
    text : sequence
        The text to segment into words
    train_text : sequence, optional
        The text used to train the model on (estimation of the
        transition probabilities). If not specified use the `text`.
    threshold : str, optional
        Type of threshold to use, must be 'relative' or 'absolute'.
    dependency : str, optional
        Type of dependency measure to compute, must be 'ftp' for
        forward transitional probability, 'btp' for backward
        transitional probability or 'mi' for mutual information.
    log : logging.Logger, optional
        The logging instance where to send messages.

    Returns
    -------
    list
        The utterances from `text` with estimated word boundaries.

    Raises
    ------
    ValueError
        If `threshold` is not 'relative' or 'absolute'.
        If `dependency` is not 'ftp', 'btp' or 'mi'.

    """
    # raise on invalid threshold type
    if threshold != 'relative' and threshold != 'absolute':
        raise ValueError(
            "invalid threshold, must be 'relative' or 'absolute', it is '{}'".
            format(threshold))

    # raise on invalid dependency measure
    if not (dependency == 'ftp' or dependency == 'btp' or dependency == 'mi'):
        raise ValueError("invalid dependency measure, must be 'ftp', 'btp' "
                         "or 'mi', it is {}".format(dependency))

    log.info('running TP with %s threshold and %s dependency measure',
             threshold, dependency)

    def _as_units(lines):
        # all the units in a single list, utterances delimited by 'UB'
        return ' UB '.join(line.strip() for line in lines).split()

    # without train text, the model is estimated on the test text itself
    test_units = _as_units(text)
    train_units = test_units if train_text is None else _as_units(train_text)

    # estimate the transition probabilities and use them to segment
    # the test text
    tps = _train(train_units, dependency)
    return _segment(test_units, tps, threshold)
def segment(text, train_text=None, grammar_file=None, category='Colloc0',
            args=DEFAULT_ARGS, save_grammar_to=None, ignore_first_parses=0,
            nruns=8, njobs=1, tempdir=tempfile.gettempdir(),
            log=utils.null_logger()):
    """Segment a text using the Adaptor Grammar algorithm

    The algorithm is run `nruns` times (8 by default) and the
    results are collapsed. The random seed is different for each
    run.

    Parameters
    ----------
    text : sequence of str
        The list of utterances to segment using the model learned
        from `train_text`.
    train_text : sequence, optional
        The list of utterances to train the model on. If None train
        the model directly on `text`.
    grammar_file : str, optional
        The path to the grammar file to use for segmentation. If not
        specified, a Colloc0 grammar is generated from the input
        text.
    category : str, optional
        The category to segment the text with, must be an existing
        parent in the grammar (i.e. the `category` must be present
        in the left column of the grammar file), default to
        'Colloc0'.
    args : str, optional
        Command line options to run the AG program with, use
        'wordseg-ag --help' to have a complete list of available
        options
    save_grammar_to : str, optional
        If defined, this is an output file where to save the grammar
        used for segmentation. This is useful to keep trace of the
        used grammar when using an auto-generated one (i.e. when
        grammar_file is None).
    ignore_first_parses : int, optional
        Ignore the first parses from the algorithm output. If
        negative, keep only the last ones (e.g. -1 keeps only the
        last one, -2 the last two).
    nruns : int, optional
        Number of runs to execute and output parses to collapse. The
        default of 8 comes from the original recipe provided by M
        Johnson.
    njobs : int, optional
        The number of parallel jobs to run
    tempdir : str, optional
        A directory where to store temporary data
    log : logging.Logger, optional
        A logger where to send log messages

    Returns
    -------
    segmented : list
        The test utterances with estimated word boundaries

    Raises
    ------
    RuntimeError
        If one of the AG runs fails or returns an error code. If
        `ignore_first_parses` is greater than or equal to the number
        of parses. If the `category` is not found in the grammar.

    """
    t1 = datetime.datetime.now()

    # if any, force the test text from sequence to list
    test_text = text
    if not isinstance(test_text, list):
        test_text = list(test_text)
    nutts = len(test_text)
    log.info('test data: %s utterances loaded', nutts)

    if train_text is None:
        train_text = test_text
        log.info('not train data provided, will train model on test data')
    else:
        # force the train text from sequence to list
        if not isinstance(train_text, list):
            train_text = list(train_text)
        # NOTE(review): nutts is overwritten with the number of train
        # utterances and is later given to ParseCounter. If the
        # counter is meant to be sized on the test text this looks
        # wrong -- confirm against ParseCounter
        nutts = len(train_text)
        log.info('train data: %s utterances loaded', nutts)

    # display the AG algorithm parameters
    log.info('parameters are: "%s"', args)

    # setup ignore_first_parses and make sure ignore_first_parses <=
    # niterations
    if '-n' in args:
        nparses = int(re.sub(r'^.*\-n *([0-9]+).*$', r'\g<1>', args))
        if '-x' in args:
            interval = int(re.sub(r'^.*\-x *([0-9]+).*$', r'\g<1>', args))
            nparses = int(nparses / interval)
        nparses += 1  # include the initial one (independent of iterations)
    else:
        nparses = 2000 + 1  # the default value fixed in C++

    if ignore_first_parses < 0:
        ignore_first_parses = max(0, nparses + ignore_first_parses)
    if ignore_first_parses >= nparses:
        raise RuntimeError('cannot ignore {} parses (max is {})'.format(
            ignore_first_parses, nparses - 1))

    # ensure we have a different seed for all runs. If the seed is
    # specified in command line (-r SEED) then feed SEED+i for i the
    # ith run. Else put a random seed to each run.
    args = _setup_seed(args, nruns)
    log.info('random seeds are: %s',
             ', '.join([arg.split('-r ')[1].split(' ')[0] for arg in args]))

    # we may use a temp file to write the grammar, it is automatically
    # erased when done
    with tempfile.NamedTemporaryFile(dir=tempdir) as grammar_temp:
        # if grammar is not specified, generate a Colloc0 one from the
        # set of phones in the input text and write it in the tempfile
        if grammar_file is None:
            grammar_file = grammar_temp.name
            log.info('generating Colloc0 grammar in %s ...', grammar_file)

            # extract all the phones in both train and test data
            phones = set(p for utt in train_text for p in utt.split() if p)
            if test_text is not None:
                phones.update(
                    set(p for utt in test_text for p in utt.split() if p))

            # build the grammar from the phoneset
            grammar = build_colloc0_grammar(phones)

            # close (and so flush) the file before the grammar is read
            # back by check_grammar and the AG runs
            with codecs.open(grammar_file, 'w', encoding='utf8') as fgrammar:
                fgrammar.write(grammar)

        check_grammar(grammar_file, category)
        log.info('valid grammar for level %s: %s', category, grammar_file)

        if save_grammar_to:
            log.info('saving grammar to %s', save_grammar_to)
            shutil.copyfile(grammar_file, save_grammar_to)

        # parallel runs of the AG algorithm
        log.info('running AG (%d times)...', nruns)

        parse_counter = ParseCounter(nutts)
        try:
            joblib.Parallel(
                n_jobs=njobs, backend="threading", verbose=0)(
                    joblib.delayed(_segment_single)(
                        parse_counter, train_text, grammar_file, category,
                        ignore_first_parses, args[n],
                        test_text=test_text,
                        log_level=log.getEffectiveLevel(),
                        tempdir=tempdir,
                        log_name='wordseg-ag - run {}'.format(n + 1))
                    for n in range(nruns))
        except joblib.JoblibRuntimeError as err:
            raise RuntimeError(err)

    t2 = datetime.datetime.now()
    log.info('total processing time: %s', t2 - t1)

    log.info('extracting most common utterances in %d parses',
             parse_counter.nparses)
    return parse_counter.most_common()
def segment(text, train_text=None, window=2, by_frequency=False, nfolds=5,
            njobs=1, log=utils.null_logger()):
    """Returns a word segmented version of `text` using the puddle algorithm

    Parameters
    ----------
    text : sequence of str
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    train_text : sequence of str
        The list of utterances to train the model on. If None
        (default) or empty, the model is trained online during
        segmentation. When `train_text` is specified, the options
        `nfolds` and `njobs` are ignored.
    window : int, optional
        Number of phonemes to be taken into account for boundary
        constraint. Default to 2.
    by_frequency : bool, optional
        When True choose the word candidates by filtering them by
        frequency. Default to False.
    nfolds : int, optional
        The number of folds to segment the `text` on. This option is
        ignored if a `train_text` is provided.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each other and can be computed in
        parallel. Requesting a number of jobs greater than `nfolds`
        has no effect. This option is ignored if a `train_text` is
        provided.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated word boundaries,
        empty utterances being filtered out.

    See also
    --------
    wordseg.folding.fold

    """
    # force the text to be a list of utterances
    text = list(text)

    if train_text:
        # force the train text from sequence to list
        train_text = list(train_text)
        log.info('train data: %s utterances loaded', len(train_text))

        # train a puddle model, then segment the test text while
        # keeping that model constant
        model = Puddle(window=window, by_frequency=by_frequency, log=log)
        model.train(train_text)
        segmented = model.segment(text, update_model=False)
        return (utt for utt in segmented if utt)

    # no train text: the model is learned online, fold by fold
    log.info('not train data provided, will train model on test data')

    log.debug('building %s folds', nfolds)
    folded_texts, fold_index = folding.fold(text, nfolds)

    # segment the folds in parallel
    jobs = (
        joblib.delayed(_do_puddle)(
            fold, window, by_frequency,
            log.getEffectiveLevel(),
            f'wordseg-puddle - fold {n+1}')
        for n, fold in enumerate(folded_texts))
    run_in_parallel = joblib.Parallel(n_jobs=njobs, verbose=0)
    segmented_texts = run_in_parallel(jobs)

    log.debug('unfolding the %s folds', nfolds)
    output_text = folding.unfold(segmented_texts, fold_index)

    return (utt for utt in output_text if utt)