def test_aligner_separate(tmpdir, request, test_data_dir, third_dir):
    """Run the lingpipe tagger and then the label aligner as separate steps.

    A hyperlink-labeled stream item is written to a chunk file under
    ``tmpdir``; ``lingpipe.process_path`` is run over that file, then
    ``byte_offset_align_labels.process_path`` is run over the same file,
    and finally the first item of the chunk is read back and checked.

    NOTE: this test used to be defined twice in this module with identical
    bodies.  The second definition silently replaced the first, so only one
    copy ever ran; the duplicate has been removed.

    :param tmpdir: pytest temporary directory (py.path.local) used both for
        the chunk file and as the tagger's ``tmp_dir_path``.
    :param request: pytest request fixture; not used by the body, kept so
        the signature stays unchanged.
    :param test_data_dir: directory passed to
        ``make_hyperlink_labeled_test_stream_item``.
    :param third_dir: value for the tagger's ``third_dir_path`` setting.
    """
    si = make_hyperlink_labeled_test_stream_item(test_data_dir)
    assert len(si.body.clean_visible) > 200

    # Persist the single stream item so both transforms can work on a path.
    c_path = str(tmpdir.join('chunk.sc'))
    chunk = streamcorpus.Chunk(c_path, mode='wb')
    chunk.add(si)
    chunk.close()

    lp = lingpipe(
        config={
            'tmp_dir_path': str(tmpdir),
            'exit_code_on_out_of_memory': 1,
            'third_dir_path': third_dir,
            'path_in_third': 'lingpipe-4.10',
            'offset_types': ['BYTES'],
            'offset_debugging': True,
            # presumably what leaves the intermediate XML files in place
            # for the two assertions below -- confirm against lingpipe
            'cleanup_tmp_files': False,
        })
    lp.process_path(c_path)

    # Both intermediate files must exist and be non-empty.
    assert tmpdir.join('chunk.sc-clean_visible.xml').read()
    assert tmpdir.join('chunk.sc-ner.xml').read()

    ## run the aligner separately
    aligner = byte_offset_align_labels(config={
        'annotator_id': 'author',
        'tagger_id': 'lingpipe',
    })
    aligner.process_path(c_path)

    ## verify the result by re-reading the first item from the chunk file
    si = list(streamcorpus.Chunk(c_path))[0]
    assert len(si.body.clean_visible) > 200
    assert len(si.body.sentences['lingpipe']) == 41


def test_byte_offset_align_labels():
    """Build ``byte_offset_align_labels`` from the shared test config and
    hand it to the common tagger-transform check."""
    aligner = byte_offset_align_labels(_test_config())
    _test_tagger_transform(aligner)