Пример #1
0
def make_absolute_paths(config):
    """Make relative ``...path`` values in ``config`` absolute, in place.

    Every string value, at any depth of nested dicts, whose key ends
    with ``"path"`` and whose value does not start with ``"/"`` is
    replaced by ``os.path.join(root_path, value)``.  Only dict values
    are recursed into; other containers are left untouched.

    ``root_path`` is popped from ``config["streamcorpus_pipeline"]``.
    If it is missing or empty the current working directory is used; if
    it is relative it is joined onto the current working directory.

    After processing, the resolved root is stored at the top level as
    ``config["root_path"]``; it is not re-inserted under
    ``config["streamcorpus_pipeline"]``.

    :param config: configuration dict containing a
      ``"streamcorpus_pipeline"`` key; modified in place
    :raises ConfigurationError: if ``"streamcorpus_pipeline"`` is not a
      key of ``config``
    :returns: None
    """
    if "streamcorpus_pipeline" not in config:
        logger.critical("bad config: %r", config)
        raise ConfigurationError('missing "streamcorpus_pipeline" from config')

    ## ``basestring`` exists only on Python 2 (where it covers both str
    ## and unicode); fall back to ``str`` so this also runs on Python 3
    try:
        string_types = basestring
    except NameError:
        string_types = str

    ## remove the root_path, so it does not get extended itself
    root_path = config["streamcorpus_pipeline"].pop("root_path", None)
    if not root_path:
        root_path = os.getcwd()

    if not root_path.startswith("/"):
        root_path = os.path.join(os.getcwd(), root_path)

    def recursive_abs_path(sub_config, root_path):
        ## iterate over a snapshot, because values are reassigned below
        for key, val in list(sub_config.items()):
            if isinstance(val, dict):
                recursive_abs_path(val, root_path)
            elif (isinstance(val, string_types)
                  and key.endswith("path")
                  and not val.startswith("/")):
                ## a relative path: anchor it at root_path
                sub_config[key] = os.path.join(root_path, val)

    recursive_abs_path(config, root_path)

    ## put the root_path back (at the top level of config)
    config["root_path"] = root_path
def test_aligner_separate(tmp_dir_path):
    '''Run the 'lingpipe' stage, then 'byte_offset_align_labels', as
    two separate steps over the same chunk file.

    A single stream item is written to ``chunk.sc`` inside
    ``tmp_dir_path``.  The lingpipe stage processes that path and is
    expected to leave non-empty ``chunk.sc-clean_visible.xml`` and
    ``chunk.sc-ner.xml`` files next to it.  The aligner stage is then
    run on the same path, and the chunk is re-read to check that the
    first item has more than 200 bytes of clean_visible and exactly 41
    lingpipe sentences.

    :param tmp_dir_path: directory used for the chunk and the
      intermediate tagger files
    '''
    def read_file(path):
        ## a context manager closes the handle instead of leaking it
        with open(path) as fh:
            return fh.read()

    si = make_hyperlink_labeled_test_stream_item()
    assert len(si.body.clean_visible) > 200
    #for x in si.body.labels['author']:
    #    print x.offsets[OffsetType.BYTES].first, x.offsets[OffsetType.BYTES].value, x.target.target_id
    c_path = os.path.join(tmp_dir_path, 'chunk.sc')
    clean_visible_path = os.path.join(tmp_dir_path, 'chunk.sc-clean_visible.xml')
    ner_path = os.path.join(tmp_dir_path, 'chunk.sc-ner.xml')

    chunk = streamcorpus.Chunk(c_path, mode='wb')
    chunk.add(si)
    chunk.close()

    lp = streamcorpus_pipeline.stages._init_stage(
        'lingpipe', dict(
            tmp_dir_path = tmp_dir_path,
            exit_code_on_out_of_memory=1,
            pipeline_root_path=os.path.join(os.path.dirname(__file__), '../../../third/'),
            offset_types = ['BYTES'],
            offset_debugging = True,
            ## keep the intermediate XML so the checks below can read it
            cleanup_tmp_files = False,
        )
    )
    logger.critical(c_path)
    lp.process_path(c_path)
    assert os.path.exists(tmp_dir_path)
    ## both intermediate files must exist and be non-empty
    assert read_file(clean_visible_path)
    assert read_file(ner_path)
    logger.critical(os.listdir(tmp_dir_path))

    ## run the aligner separately
    aligner = streamcorpus_pipeline.stages._init_stage(
        'byte_offset_align_labels',
        dict(
            annotator_id = 'author',
            tagger_id = 'lingpipe'
        )
    )
    aligner.process_path(c_path)

    ## verify that we get the same answer as test above
    si = list(streamcorpus.Chunk(c_path))[0]
    logger.critical('%d bytes clean_visible for %s', len(si.body.clean_visible), si.stream_id)
    assert len(si.body.clean_visible) > 200
    logger.critical('%d sentences for %s', len(si.body.sentences['lingpipe']), si.stream_id)
    ## on failure, the NER XML is read and shown as the assertion message
    assert len(si.body.sentences['lingpipe']) == 41, read_file(ner_path)
    assert os.path.exists(tmp_dir_path)
def test_pipeline_run(cmd_expect_success):
    '''Run a shell command and check whether it succeeds or fails as expected.

    The command is started through the shell with stdout and stderr
    captured.  A watchdog timer kills the child if it is still running
    after ``max_time`` (900) seconds, so the timeout is actually
    enforced while waiting for output.  Captured stdout and stderr are
    always logged, including when the command exits immediately.

    :param cmd_expect_success: ``(cmd, expect_success)`` pair; ``cmd``
      is the shell command string and ``expect_success`` says whether
      a zero exit status is expected
    :raises Exception: if the command had to be killed for exceeding
      the time limit
    '''
    ## local import: only the watchdog below needs it
    import threading

    cmd, expect_success = cmd_expect_success
    logger.critical(cmd)
    max_time = 900
    start_time = time.time()
    ## NOTE(review): cmd is run with shell=True -- presumably it comes
    ## only from trusted test parametrization; confirm against caller
    p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)

    timed_out = threading.Event()

    def kill_on_timeout():
        timed_out.set()
        try:
            p.kill()
        except OSError:
            ## the child exited between the timer firing and the kill
            pass

    watchdog = threading.Timer(max_time, kill_on_timeout)
    watchdog.daemon = True
    watchdog.start()
    try:
        ## communicate() blocks until the child exits and drains both
        ## pipes, so a chatty child cannot deadlock on a full pipe
        out, err = p.communicate()
    finally:
        watchdog.cancel()

    logger.critical( out )
    logger.critical( err )

    if timed_out.is_set():
        raise Exception('timed out after %d seconds' % (time.time() - start_time))

    ret = p.returncode
    if expect_success:
        assert ret == 0
    else:
        assert ret != 0