def test_aligner_separate(tmpdir, request, test_data_dir, third_dir):
    """Tag a chunk with lingpipe, then run the byte-offset aligner as its own step.

    A hyperlink-labeled stream item is written to ``chunk.sc`` under
    ``tmpdir``, the chunk path is handed to ``lingpipe`` and afterwards to
    ``byte_offset_align_labels``, and the chunk is then reloaded to check the
    number of sentences stored under the ``"lingpipe"`` tagger id.

    ``request`` is accepted but not used in the body.

    NOTE(review): this file defines ``test_aligner_separate`` a second time
    further down; the later definition replaces this one at import time.
    """
    stream_item = make_hyperlink_labeled_test_stream_item(test_data_dir)
    assert len(stream_item.body.clean_visible) > 200

    # Debugging aid for inspecting the 'author' labels (kept disabled):
    # for x in stream_item.body.labels['author']:
    #     print x.offsets[OffsetType.BYTES].first, x.offsets[OffsetType.BYTES].value, x.target.target_id

    # Persist the single stream item so both transforms can work from a path.
    chunk_path = str(tmpdir.join("chunk.sc"))
    out_chunk = streamcorpus.Chunk(chunk_path, mode="wb")
    out_chunk.add(stream_item)
    out_chunk.close()

    tagger_config = {
        "tmp_dir_path": str(tmpdir),
        "exit_code_on_out_of_memory": 1,
        "third_dir_path": third_dir,
        "path_in_third": "lingpipe-4.10",
        "offset_types": ["BYTES"],
        "offset_debugging": True,
        "cleanup_tmp_files": False,
    }
    tagger = lingpipe(config=tagger_config)
    tagger.process_path(chunk_path)

    # The intermediate XML files must exist in tmpdir and be non-empty
    # (presumably left behind because cleanup_tmp_files is False).
    clean_visible_xml = tmpdir.join("chunk.sc-clean_visible.xml")
    assert clean_visible_xml.read()
    ner_xml = tmpdir.join("chunk.sc-ner.xml")
    assert ner_xml.read()

    ## run the aligner separately
    aligner_config = {"annotator_id": "author", "tagger_id": "lingpipe"}
    aligner = byte_offset_align_labels(config=aligner_config)
    aligner.process_path(chunk_path)

    ## verify that we get the same answer as the companion test
    ## (per the original comment; that test is not visible here)
    reloaded_items = list(streamcorpus.Chunk(chunk_path))
    stream_item = reloaded_items[0]
    assert len(stream_item.body.clean_visible) > 200
    assert len(stream_item.body.sentences["lingpipe"]) == 41
def test_aligner_separate(tmpdir, request, test_data_dir, third_dir):
    '''Exercise lingpipe tagging and byte-offset label alignment as two passes.

    Steps: build a hyperlink-labeled stream item, save it to ``chunk.sc``
    inside ``tmpdir``, call ``process_path`` on a ``lingpipe`` instance and
    then on a ``byte_offset_align_labels`` instance, and finally reload the
    chunk to check the sentence count under the ``'lingpipe'`` tagger id.

    ``request`` is part of the signature but is not referenced in the body.

    NOTE(review): an earlier definition with this same name precedes this one
    in the file; this later definition is the one left bound to the name.
    '''
    item = make_hyperlink_labeled_test_stream_item(test_data_dir)
    assert len(item.body.clean_visible) > 200

    # Disabled debugging loop for dumping the 'author' label offsets:
    # for x in item.body.labels['author']:
    #     print x.offsets[OffsetType.BYTES].first, x.offsets[OffsetType.BYTES].value, x.target.target_id

    # Write the item out so the transforms can be driven by file path.
    path_to_chunk = str(tmpdir.join('chunk.sc'))
    writer = streamcorpus.Chunk(path_to_chunk, mode='wb')
    writer.add(item)
    writer.close()

    lingpipe_settings = {
        'tmp_dir_path': str(tmpdir),
        'exit_code_on_out_of_memory': 1,
        'third_dir_path': third_dir,
        'path_in_third': 'lingpipe-4.10',
        'offset_types': ['BYTES'],
        'offset_debugging': True,
        'cleanup_tmp_files': False,
    }
    lingpipe(config=lingpipe_settings).process_path(path_to_chunk)

    # Both intermediate XML files are expected in tmpdir with content.
    for leftover in ('chunk.sc-clean_visible.xml', 'chunk.sc-ner.xml'):
        assert tmpdir.join(leftover).read()

    ## run the aligner separately
    alignment_settings = {
        'annotator_id': 'author',
        'tagger_id': 'lingpipe',
    }
    byte_offset_align_labels(config=alignment_settings).process_path(path_to_chunk)

    ## verify that we get the same answer as the companion test
    ## (per the original comment; that test is not visible here)
    item = list(streamcorpus.Chunk(path_to_chunk))[0]
    body = item.body
    assert len(body.clean_visible) > 200
    assert len(body.sentences['lingpipe']) == 41
def test_byte_offset_align_labels():
    """Build ``byte_offset_align_labels`` from the shared test config and
    pass it to ``_test_tagger_transform``."""
    # Same call order as before: config first, then the transform, then the check.
    _test_tagger_transform(byte_offset_align_labels(_test_config()))