def test_pipeline(request, test_data_dir):
    '''
    Start a pipeline task in a greenlet and shut the pipeline down
    with SIGTERM while the task is in flight.

    The pipeline is configured from test_dedup_chunk_counts.yaml, found
    via request.fspath.  sys.stdin is temporarily replaced with a
    StringIO holding the test chunk path and is restored on exit so
    that later tests see the real stdin again.

    :param request: pytest request fixture, used to locate the yaml file
    :param test_data_dir: passed through to get_test_chunk_path
    '''
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            g = gevent.spawn(p._process_task, work_unit)

            ## presumably gives the spawned task time to get going
            ## before the shutdown is requested
            gevent.sleep(5)

            ## shutdown is expected to exit the process via SystemExit
            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)

            logger.debug('now joining...')
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            ## undo the stdin swap even when the test fails
            sys.stdin = stdin
# Example #2
# 0
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled

    Every stream item of the input test chunk has its body labels and
    sentences cleared, is passed through the hyperlink_labels and
    make_clean_visible stages, and is then added to a new chunk that
    is created under tmpdir.

    :param tmpdir: directory object providing ``join``; presumably
      pytest's tmpdir fixture -- confirm against callers
    :returns: the path object of the newly written chunk
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    ## NOTE(review): other callers in this file pass test_data_dir to
    ## get_test_chunk_path; confirm the no-argument call is supported
    ipath = get_test_chunk_path()

    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    ## close and return only after the whole input has been processed;
    ## doing this inside the loop would stop after the first item and
    ## would return None (leaving the chunk open) for an empty input
    o_chunk.close()
    return tpath
def test_pipeline(request, test_data_dir):
    '''
    Spawn a pipeline task in a greenlet, then request a SIGTERM
    shutdown and check that it raises SystemExit.

    Configuration comes from test_dedup_chunk_counts.yaml, located
    through request.fspath.  While the test runs, sys.stdin is a
    StringIO containing the test chunk path; the original stdin is put
    back afterwards so the swap does not leak into other tests.

    :param request: pytest request fixture, used to locate the yaml file
    :param test_data_dir: passed through to get_test_chunk_path
    '''
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            g = gevent.spawn(p._process_task, work_unit)

            ## yield to the spawned greenlet for a while before
            ## asking the pipeline to shut down
            gevent.sleep(5)

            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)

            logger.debug('now joining...')
            ## wait at most one second for the task greenlet to finish
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            ## restore the real stdin whether or not the test passed
            sys.stdin = stdin
def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    '''
    Upgrade the leading stream items of the test chunk and check that
    each result carries the v0_3_0 version value.

    Items are read as StreamItem_v0_2_0 messages; the loop stops after
    the eleventh item.

    :param test_data_dir: passed through to get_test_chunk_path
    '''
    upgrader = upgrade_streamcorpus_v0_3_0(config={})
    chunk = streamcorpus.Chunk(
        get_test_chunk_path(test_data_dir),
        message=streamcorpus.StreamItem_v0_2_0,
    )

    for seen, stream_item in enumerate(chunk, 1):
        upgraded = upgrader(stream_item)
        assert upgraded.version == streamcorpus.Versions._NAMES_TO_VALUES["v0_3_0"]
        ## only a sample of the chunk is checked
        if seen > 10:
            break
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    '''
    Run the pipeline described by test_dedup_chunk_counts.yaml over the
    test chunk, with tmp_dir_path overridden to the given tmpdir.

    There are no explicit assertions; the test passes as long as the
    run completes without raising.

    :param request: pytest request fixture, used to locate the yaml file
    :param test_data_dir: passed through to get_test_chunk_path
    :param tmpdir: directory whose string form becomes tmp_dir_path
    '''
    yaml_path = request.fspath.dirpath('test_dedup_chunk_counts.yaml')
    overrides = {'tmp_dir_path': str(tmpdir)}
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=str(yaml_path),
                                   config=overrides) as config:
        ## build the pipeline from the merged configuration and run it
        factory = PipelineFactory(PipelineStages())
        pipeline = factory(config['streamcorpus_pipeline'])
        pipeline.run(get_test_chunk_path(test_data_dir))
# Example #6
# 0
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    '''
    Smoke-test a full pipeline run using test_dedup_chunk_counts.yaml.

    The configuration's tmp_dir_path is set to str(tmpdir).  Nothing is
    asserted about the output; an exception from the run is the only
    way this test fails.

    :param request: pytest request fixture, used to locate the yaml file
    :param test_data_dir: passed through to get_test_chunk_path
    :param tmpdir: directory whose string form becomes tmp_dir_path
    '''
    config_file = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    extra_config = {'tmp_dir_path': str(tmpdir)}
    config_context = yakonfig.defaulted_config(
        [streamcorpus_pipeline],
        filename=config_file,
        config=extra_config)
    with config_context as config:
        ## run the pipeline
        stages = PipelineStages()
        make_pipeline = PipelineFactory(stages)
        pipeline_config = config['streamcorpus_pipeline']
        make_pipeline(pipeline_config).run(get_test_chunk_path(test_data_dir))
def test_post_batch_incremental_stage(request, test_data_dir):
    '''
    Run a Pipeline built from test_post_batch_incremental.yaml, feeding
    it the test chunk path on stdin.

    The yaml file is looked up next to this test module.  sys.stdin is
    replaced with a StringIO for the duration of the run and restored
    afterwards.  There are no explicit assertions; the test passes if
    the run does not raise.

    :param request: pytest request fixture (not used in the body)
    :param test_data_dir: passed through to get_test_chunk_path
    '''
    path = os.path.dirname(__file__)
    config_path = os.path.join(path, 'test_post_batch_incremental.yaml')
    ## NOTE(review): yaml.load is called without an explicit Loader.
    ## That is acceptable for a trusted fixture file, but newer PyYAML
    ## releases require a Loader argument -- consider yaml.safe_load.
    with open(config_path) as config_file:
        config = yaml.load(config_file)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
    try:
        ## run the pipeline
        p = Pipeline( config )
        p.run()
    finally:
        ## put the real stdin back so later tests are unaffected
        sys.stdin = stdin
# Example #8
# 0
def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    '''
    Check that upgrade_streamcorpus_v0_3_0 yields items whose version
    equals the v0_3_0 entry of streamcorpus.Versions.

    The test chunk is read as StreamItem_v0_2_0 messages and at most
    eleven of them are upgraded and checked.

    :param test_data_dir: passed through to get_test_chunk_path
    '''
    upgrade = upgrade_streamcorpus_v0_3_0(config={})
    items = streamcorpus.Chunk(get_test_chunk_path(test_data_dir),
                               message=streamcorpus.StreamItem_v0_2_0)

    for position, old_item in enumerate(items, 1):
        new_item = upgrade(old_item)
        assert new_item.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        ## stop once the eleventh item has been verified
        if position > 10:
            break
def test_post_batch_incremental_stage(request, test_data_dir):
    '''
    Load test_post_batch_incremental.yaml from this module's directory
    and run a Pipeline with it, supplying the test chunk path on stdin.

    sys.stdin is swapped for a StringIO while the pipeline runs and is
    restored afterwards, and the yaml file handle is closed as soon as
    the configuration has been parsed.  The test makes no assertions
    beyond the run completing without an exception.

    :param request: pytest request fixture (not used in the body)
    :param test_data_dir: passed through to get_test_chunk_path
    '''
    path = os.path.dirname(__file__)
    ## NOTE(review): yaml.load without an explicit Loader is kept as-is
    ## for this trusted fixture; newer PyYAML releases require a Loader
    ## argument, so yaml.safe_load may be the better call -- confirm.
    with open(os.path.join(path, 'test_post_batch_incremental.yaml')) as fh:
        config = yaml.load(fh)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
    try:
        ## run the pipeline
        p = Pipeline(config)
        p.run()
    finally:
        ## always restore stdin, even if the pipeline raised
        sys.stdin = stdin
def make_hyperlink_labeled_test_chunk(tmpdir):
    """
    returns a path to a temporary chunk that has been hyperlink labeled

    Each stream item read from the input test chunk has its body labels
    and sentences reset, is run through the hyperlink_labels and
    make_clean_visible stages, and is added to a freshly created chunk
    under tmpdir.

    :param tmpdir: directory object providing ``join``; presumably
      pytest's tmpdir fixture -- confirm against callers
    :returns: the path object of the newly written chunk
    """
    tpath = tmpdir.join(str(uuid.uuid1()) + ".sc")
    o_chunk = Chunk(tpath, mode="wb")

    ## NOTE(review): other callers in this file pass test_data_dir to
    ## get_test_chunk_path; confirm the no-argument call is supported
    ipath = get_test_chunk_path()

    hl = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [BYTES]})
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    ## these two statements belong after the loop: inside it, only the
    ## first item would be written, and an empty input would leave the
    ## chunk unclosed and return None
    o_chunk.close()
    return tpath