def test_prepare_bad_types():
    """prepare() must raise AttributeError on a dict or a list of int."""
    # each entry builds the prepare() generator for one invalid input; the
    # call is deferred so that it happens inside the pytest.raises block
    invalid_calls = (
        lambda: prepare({1: 1, 2: 2}),
        lambda: prepare([1, 2], separator=Separator()),
    )

    for invalid_call in invalid_calls:
        with pytest.raises(AttributeError):
            # consume the generator to trigger the failure
            list(invalid_call())
def test_prepare_tolerant():
    """Check the `tolerant` flag of prepare() on a mix of good/bad input.

    With tolerant=False the bad utterances make prepare() raise a
    ValueError; with tolerant=True the result equals what is obtained
    from the good utterances alone.

    """
    mixed = GOOD_UTTERANCES + BAD_UTTERANCES

    # strict mode: the bad utterances are rejected
    with pytest.raises(ValueError):
        list(prepare(mixed, tolerant=False))

    # tolerant mode: only the good utterances remain
    kept = list(prepare(mixed, tolerant=True))
    assert len(kept) == len(GOOD_UTTERANCES)

    reference = list(prepare(GOOD_UTTERANCES))
    assert kept == reference
def test_empty_lines():
    """Empty lines must not be yielded by prepare() nor by gold()."""
    # a text made of empty lines only gives nothing
    only_blank = ['', '']
    assert len(list(prepare(only_blank))) == 0
    assert len(list(gold(only_blank))) == 0

    # an empty line between two utterances: only the 2 utterances remain
    utterance = 'hh ax l ;esyll ow ;esyll ;eword'
    with_blank = [utterance, '', utterance]

    prepared = list(prepare(with_blank, separator=Separator(), unit='phone'))
    assert len(prepared) == 2

    golden = list(gold(with_blank, separator=Separator()))
    assert len(golden) == 2
def test_replicate_cdswordseg(datadir):
    """Check that dibs replicates the scores obtained with CDSWordSeg.

    Reads `tagged.txt` from `datadir`, trains a dibs model on the first
    200 lines, segments the whole prepared text and compares the
    evaluation scores to the reference values (relative tolerance 1e-3).

    """
    sep = Separator()

    # read the corpus within a context manager so the file is closed as
    # soon as the lines are loaded
    tagged_path = os.path.join(datadir, 'tagged.txt')
    with codecs.open(tagged_path, 'r', encoding='utf8') as tagged:
        _tags = [utt for utt in tagged if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    # the model is trained on the 200 first lines only
    _train = _tags[:200]
    model = dibs.CorpusSummary(_train)

    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordseg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate(datadir):
    """Check that puddle.segment reproduces the reference scores.

    Reads the 100 first lines of `tagged.txt` from `datadir`, segments
    the prepared text with puddle (nfolds=1) and compares the
    evaluation scores to the reference values (relative tolerance 1e-3).

    """
    sep = Separator()

    # read the corpus within a context manager so the file is closed as
    # soon as the lines are loaded
    tagged_path = os.path.join(datadir, 'tagged.txt')
    with codecs.open(tagged_path, 'r', encoding='utf8') as tagged:
        _tags = [utt for utt in tagged if utt][:100]  # 100 first lines only

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # reference scores (using wordseg.prepare and wordseg.evaluate in
    # both cases). NOTE(review): the original comment attributed them to
    # "the dibs version in CDSWordSeg", which looks like a copy-paste
    # from the dibs test -- confirm they come from the puddle version.
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423
    }

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate(datadir):
    """Check that tp.segment reproduces the reference scores.

    Reads the 100 first lines of `tagged.txt` from `datadir`, segments
    the prepared text with tp (default parameters) and compares the
    evaluation scores to the reference values (relative tolerance 1e-3).

    """
    sep = Separator()

    # read the corpus within a context manager so the file is closed as
    # soon as the lines are loaded
    tagged_path = os.path.join(datadir, 'tagged.txt')
    with codecs.open(tagged_path, 'r', encoding='utf8') as tagged:
        _tags = [utt for utt in tagged if utt][:100]  # 100 first lines only

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = tp.segment(_prepared)
    score = evaluate(segmented, _gold)

    # reference scores (using wordseg.prepare and wordseg.evaluate in
    # both cases). NOTE(review): the original comment attributed them to
    # "the dibs version in CDSWordSeg", which looks like a copy-paste
    # from the dibs test -- confirm they come from the tp version.
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685
    }

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate_cdswordseg(datadir):
    """Check that segment() replicates the dpseg scores from CDSWordSeg.

    Reads the 10 last lines of `tagged.txt` from `datadir`, prepares
    them at syllable level, segments them with the `uni_dmcmc` dpseg
    configuration (5 folds, 4 jobs) and compares the evaluation scores
    to the reference values (relative tolerance 1e-3).

    """
    sep = Separator()

    # only the last 10 lines, for a fast test. We cannot take the 10
    # first lines because they cause the dpseg_bugfix to correct a
    # fold (the implementation of that fix differs in CDS and wordseg,
    # so the results are not replicated exactly)
    tagged_path = os.path.join(datadir, 'tagged.txt')
    # the context manager closes the file as soon as the lines are loaded
    with codecs.open(tagged_path, 'r', encoding='utf8') as tagged:
        _tags = [utt for utt in tagged if utt][-10:]

    _prepared = prepare(_tags, separator=sep, unit='syllable')
    _gold = gold(_tags, separator=sep)

    # first dpseg configuration file whose path contains 'uni_dmcmc'
    uni_dmcmc_conf = [
        c for c in wordseg.utils.get_config_files('dpseg')
        if 'uni_dmcmc' in c][0]
    args = '--ngram 1 --a1 0 --b1 1 -C {}'.format(uni_dmcmc_conf)

    segmented = segment(_prepared, nfolds=5, njobs=4, args=args)
    score = evaluate(segmented, _gold)

    # we obtained that scores from the dpseg version in CDSWordSeg
    expected = {
        'type_fscore': 0.3768,
        'type_precision': 0.3939,
        'type_recall': 0.3611,
        'token_fscore': 0.3836,
        'token_precision': 0.4118,
        'token_recall': 0.359,
        'boundary_all_fscore': 0.7957,
        'boundary_all_precision': 0.8409,
        'boundary_all_recall': 0.7551,
        'boundary_noedge_fscore': 0.6415,
        'boundary_noedge_precision': 0.7083,
        'boundary_noedge_recall': 0.5862
    }

    assert score == pytest.approx(expected, rel=1e-3)
def main():
    """Entry point of the 'wordseg-dibs' command

    Reads the test utterances from the input stream, trains a
    `CorpusSummary` either on the train file given in arguments or, by
    default, on the test utterances themselves, then writes the
    segmented text to the output stream. When `args.diphones` is set,
    the diphones of the trained model are also saved to that file,
    sorted by decreasing value.

    Raises
    ------
    ValueError
        If a train file is specified but does not exist, or if a train
        file is specified and the word separator is found in the test
        text.

    """
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dibs',
        description=__doc__,
        add_arguments=_add_arguments,
        train_file=True)

    # setup the separator from parsed arguments
    separator = Separator(
        phone=args.phone_separator,
        syllable=args.syllable_separator,
        word=args.word_separator)

    # load test text as a list of utterances, ignore empty lines
    test_text = [line for line in streamin if line]
    log.info('loaded %s utterances as test data', len(test_text))

    # user provided a train text, ensure it is valid and that test_text does
    # not include word separators
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise ValueError(
                'train file specified but does not exist: {}'.format(
                    args.train_file))

        # make sure test_text is in prepared form
        for n, line in enumerate(test_text):
            if separator.word in line:
                raise ValueError(
                    f'word separator found in test text (line {n+1})')

        # load the train text, ignore empty lines. The context manager
        # closes the file as soon as the utterances are loaded.
        with codecs.open(
                args.train_file, 'r', encoding='utf8') as train_stream:
            train_text = [line for line in train_stream if line]
        log.info('loaded %s utterances as train data', len(train_text))
    else:
        log.info('using test data for training')
        # the presence of word separator in train utterance will be checked
        # during training
        train_text = test_text

        # remove the word separators for testing
        # NOTE(review): prepare() is called with its default separator
        # here, not with the `separator` built from the arguments --
        # confirm this is intended
        test_text = prepare(test_text)

    # train the model (learn diphone statistics)
    trained_model = CorpusSummary(
        train_text, separator=separator, level=args.unit, log=log)

    # segment the test text on the trained model
    segmented = segment(
        test_text, trained_model,
        type=args.type,
        threshold=args.threshold,
        pwb=args.pboundary,
        log=log)

    # output the segmented text
    streamout.write('\n'.join(segmented) + '\n')

    # save the computed diphones if required
    if args.diphones:
        log.info(
            'saving %s diphones to %s',
            len(trained_model.diphones), args.diphones)

        # one diphone per line, formatted as "<value> <k[0]> <k[1]>" and
        # sorted by decreasing value
        output = (
            '{} {} {}'.format(v, k[0], k[1]) for k, v in sorted(
                trained_model.diphones.items(),
                key=operator.itemgetter(1),
                reverse=True))

        # the context manager guarantees the file is flushed and closed,
        # instead of relying on garbage collection of the file object
        with codecs.open(
                args.diphones, 'w', encoding='utf8') as diphones_stream:
            diphones_stream.write('\n'.join(output) + '\n')
def test_punctuation(utt):
    """prepare() raises ValueError on `utt` only if punctuation is checked."""
    def run(check):
        # consume the generator so that the utterance is actually processed
        return list(prepare([utt], check_punctuation=check))

    # punctuation check enabled: the utterance is rejected
    with pytest.raises(ValueError):
        run(True)

    # punctuation check disabled: the same utterance must not raise
    run(False)
def f(u):
    """Return the list obtained by preparing p['raw'] with unit `u`.

    `p` comes from the enclosing scope; a fresh default Separator() is
    used for each call.

    """
    prepared = prepare(p['raw'], separator=Separator(), unit=u)
    return list(prepared)
# load the input text file (the context manager closes the file once
# all the lines are read)
with open(sys.argv[1], 'r') as input_file:
    text = input_file.readlines()

# compute some statistics on the input text (text tokenized at phone
# and word levels)
separator = Separator(phone=' ', syllable=None, word=';eword')
stats = CorpusStatistics(text, separator).describe_all()

# display the computed statistics
sys.stdout.write(
    '* Statistics\n\n' +
    json.dumps(stats, indent=4) + '\n')

# prepare the input for segmentation
prepared = list(prepare(text))

# generate the gold text. NOTE: this rebinds the name `gold` from the
# gold() function to the list it returned, so gold() can no longer be
# called after this line.
gold = list(gold(text))

# segment the prepared text with different algorithms
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones)
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)