def test_sgml_reconstruction():
    """Round-trip every tagged reference through the BIO encoding.

    Each SGML example in the reference dataset is converted to
    (label, token) pairs with ``sgml2bio``, the labels are turned into spans
    with ``bio2span``, and SGML is rebuilt from those spans.  The rebuilt
    text must match the original example up to whitespace.

    Raises:
        AssertionError: if a reconstructed example differs from its source
            by more than whitespace.
        IOError: if the reference dataset cannot be opened.
    """
    # NOTE(review): developer-local absolute path; the test can only run on a
    # machine where this file exists.
    reference_dataset = '/home/timv/projects/data/citations/tagged_references.txt'

    # `open` instead of the Python-2-only `file` builtin; the handle is
    # released as soon as the contents have been read.
    with open(reference_dataset, 'r') as f:
        contents = f.read()

    for sgml in line_groups(contents, '<NEW.*?>'):
        (labels, tokens) = zip(*sgml2bio(sgml))

        # convert spans to sgml
        spans = bio2span(labels)
        reconstructed = ' '.join(
            '<%s>%s</%s>' % (label, ' '.join(tokens[begin:end]), label)
            for (label, begin, end) in spans
        )

        assert equals_mod_whitespace(reconstructed, sgml), \
            'reconstructed example should only differ in whitespace.'

    # Single-argument form prints identically on Python 2 and Python 3.
    print('passed sgml reconstruction test.')
def _check_bio2span_cases(cases, include_O):
    """Run ``bio2span`` over ``cases`` and return the number of mismatches.

    ``cases`` is a sequence of ``(labels, expected_spans)`` pairs.  Every case
    is printed together with a coloured passed/failed marker; for failures the
    expected and actual span lists are printed as well.
    """
    failures = 0
    for labels, expected in cases:
        print(labels)
        got = bio2span(labels, include_O=include_O)
        if expected == got:
            print('\033[32mpassed\033[0m')
        else:
            failures += 1
            print('\033[31mfailed:\033[0m')
            print(' expected: %s' % (expected,))
            print(' got: %s' % (got,))
        print('')
    return failures


def test_bio2span():
    """Check ``bio2span`` against hand-written label sequences.

    Two groups of cases are exercised: one with ``include_O=False`` (only the
    labelled spans are expected) and one with ``include_O=True`` (``O`` tokens
    are expected as spans too).  All cases in both groups are run and
    reported before the outcome is decided, so a single run shows every
    mismatch.

    Raises:
        AssertionError: if any case in either group does not produce the
            expected spans.
    """
    tests = [
        (['I-NUM', 'I-TEMP'],
         [Span(label='NUM', begins=0, ends=1),
          Span(label='TEMP', begins=1, ends=2)]),

        (['I-NUM', 'B-TEMP'],
         [Span(label='NUM', begins=0, ends=1),
          Span(label='TEMP', begins=1, ends=2)]),

        (['B-NUM', 'B-TEMP'],
         [Span(label='NUM', begins=0, ends=1),
          Span(label='TEMP', begins=1, ends=2)]),

        # NOTE(review): exact duplicate of the previous case; kept as-is
        # because the intended variant is not clear from the data.
        (['B-NUM', 'B-TEMP'],
         [Span(label='NUM', begins=0, ends=1),
          Span(label='TEMP', begins=1, ends=2)]),

        (['B-NUM', 'O'],
         [Span(label='NUM', begins=0, ends=1)]),

        (['O', 'B-NUM'],
         [Span(label='NUM', begins=1, ends=2)]),

        (['O', 'B-NUM', 'O'],
         [Span(label='NUM', begins=1, ends=2)]),

        (['O', 'B-NUM', 'I-NUM'],
         [Span(label='NUM', begins=1, ends=3)]),

        (
            ['O', 'O', 'O', 'I-NUM', 'I-NUM', 'I-NUM', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'O', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            [Span(label='NUM', begins=3, ends=6),
             Span(label='TEMP', begins=6, ends=14),
             Span(label='NUM', begins=15, ends=23)],
        ),

        (
            ['O', 'O', 'O', 'O', 'O', 'O', 'I-TEMP', 'I-NUM', 'I-NUM', 'I-NUM', 'O', 'I-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-TEMP', 'I-TEMP', 'I-TEMP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            [Span(label='TEMP', begins=6, ends=7),
             Span(label='NUM', begins=7, ends=10),
             Span(label='NUM', begins=11, ends=15),
             Span(label='TEMP', begins=23, ends=26)],
        ),

        (
            ['O', 'O', 'O', 'B-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'B-NUM', 'I-NUM', 'I-NUM', 'I-NUM', 'O', 'B-NUM', 'I-NUM', 'I-NUM', 'O', 'O', 'O', 'B-TEMP', 'I-TEMP', 'O'],
            [Span(label='NUM', begins=3, ends=7),
             Span(label='NUM', begins=7, ends=11),
             Span(label='NUM', begins=12, ends=15),
             Span(label='TEMP', begins=18, ends=20)],
        ),
    ]

    # Previously a mismatch in this group was only printed, so the function
    # could still report success; failures are now counted and raised below.
    failures = _check_bio2span_cases(tests, include_O=False)

    print('*************************************************************************')
    print('** Including O Spans.')

    include_O = [
        (['O', 'B-NUM', 'I-DATE'],
         [Span(label='O', begins=0, ends=1),
          Span(label='NUM', begins=1, ends=2),
          Span(label='DATE', begins=2, ends=3)]),

        (['O', 'B-NUM', 'O', 'I-DATE'],
         [Span(label='O', begins=0, ends=1),
          Span(label='NUM', begins=1, ends=2),
          Span(label='O', begins=2, ends=3),
          Span(label='DATE', begins=3, ends=4)]),

        (['O', 'B-NUM', 'O', 'I-DATE', 'O', 'O'],
         [Span(label='O', begins=0, ends=1),
          Span(label='NUM', begins=1, ends=2),
          Span(label='O', begins=2, ends=3),
          Span(label='DATE', begins=3, ends=4),
          Span(label='O', begins=4, ends=5),
          Span(label='O', begins=5, ends=6)]),

        (['O'],
         [Span(label='O', begins=0, ends=1)]),
    ]

    failures += _check_bio2span_cases(include_O, include_O=True)

    if failures:
        raise AssertionError('%d bio2span case(s) failed' % failures)

    print('passed test_bio2span')