Example #1
def replace_config(config, name):
    # Do we have external stages?
    if ('external_stages_path' not in config
            and 'external_stages_modules' not in config):
        return streamcorpus_pipeline
    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            stages.load_external_stages(path)  # use the normalized path from above
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this
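
The interesting detail in replace_config is the path normalization: a relative external_stages_path is resolved against root_path before anything is loaded. A minimal, self-contained sketch of just that step, with hypothetical values:

import os.path

# Hypothetical config values, for illustration only.
config = {'external_stages_path': 'stages/my_stages.py',
          'root_path': '/srv/pipeline'}

path = config['external_stages_path']
if not os.path.isabs(path) and config.get('root_path'):
    path = os.path.join(config['root_path'], path)

print(path)  # /srv/pipeline/stages/my_stages.py
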
Example #2
def replace_config(config, name):
    # Do we have external stages?
    if ('external_stages_path' not in config and
        'external_stages_modules' not in config):
        return streamcorpus_pipeline
    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            stages.load_external_stages(path)  # use the normalized path from above
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this
Example #3
def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([dblogger, kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
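
Note that rejester_run_function touches only two attributes of the work unit directly: spec (to recover the stored config) and data (for bookkeeping counters). A hypothetical stand-in satisfying that interface, handy for driving the function outside rejester; the attribute names mirror the usage above, everything else is an assumption:

class FakeWorkUnit(object):
    """Hypothetical stand-in for a rejester work unit."""
    def __init__(self, key, spec=None):
        self.key = key          # identifier for the input to process
        self.spec = spec or {}  # read as work_unit.spec.get('config', {})
        self.data = {}          # written: start_chunk_time, start_count
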
Example #4
def check_config(config, name):
    if 'tmp_dir_path' not in config:
        raise ConfigurationError(
            '{} requires tmp_dir_path setting'.format(name))

    # Checking stages:
    stages = PipelineStages()

    # (1) Push in the external stages;
    if 'external_stages_path' in config:
        try:
            stages.load_external_stages(config['external_stages_path'])
        except IOError as e:
            raise ConfigurationError(
                'invalid {} external_stages_path {}'.format(
                    name, config['external_stages_path']), e)
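
A hedged usage sketch of the two failure modes check_config guards against; assuming ConfigurationError comes from yakonfig, its usual home:

from yakonfig import ConfigurationError  # assumed import location

try:
    check_config({}, 'streamcorpus_pipeline')  # missing tmp_dir_path
except ConfigurationError as exc:
    print(exc)  # 'streamcorpus_pipeline requires tmp_dir_path setting'

try:
    check_config({'tmp_dir_path': '/tmp',
                  'external_stages_path': '/no/such/stages.py'},
                 'streamcorpus_pipeline')
except ConfigurationError as exc:
    print(exc)  # 'invalid streamcorpus_pipeline external_stages_path ...'
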
Example #5
def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)

        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)

        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)
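
The gevent choreography here (spawn the task, let it run, force a shutdown, then join with a bound) is easy to miss inside the test; the same pattern in isolation, as a minimal sketch:

import gevent

def worker():
    gevent.sleep(2)  # stands in for p._process_task(work_unit)

g = gevent.spawn(worker)
gevent.sleep(0.1)                  # let the worker get scheduled
g.join(timeout=gevent.Timeout(1))  # returns after 1s even if still running
print(g.ready())                   # False: the join timed out
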
Example #6
def check_config(config, name):
    if 'tmp_dir_path' not in config:
        raise ConfigurationError('{} requires tmp_dir_path setting'
                                 .format(name))

    # Checking stages:
    stages = PipelineStages()

    # (1) Push in the external stages; 
    if 'external_stages_path' in config:
        try:
            stages.load_external_stages(config['external_stages_path'])
        except IOError as e:
            raise ConfigurationError(
                'invalid {} external_stages_path {}'
                .format(name, config['external_stages_path']), e)
Example #7
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename,
                                   config={'tmp_dir_path':
                                           str(tmpdir)}) as config:
        ## run the pipeline
        pf = PipelineFactory(PipelineStages())
        p = pf(config['streamcorpus_pipeline'])
        p.run(get_test_chunk_path(test_data_dir))
Example #8
def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        ## explicitly call cleanup, which is idempotent and might not
        ## get called by atexit if we are running under
        ## multiprocessing
        pipeline.cleanup()
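
A sketch of the "idempotent cleanup" idea the comment describes; this is not the library's implementation, just the usual guard-flag pattern that lets an explicit call and an atexit call coexist safely:

class CleansUpOnce(object):
    def __init__(self):
        self._cleaned_up = False

    def cleanup(self):
        if self._cleaned_up:  # second and later calls are no-ops
            return
        self._cleaned_up = True
        # ... flush writers, remove tmp files, etc. ...
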
Example #9
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
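
The read-back assertion used throughout these tests relies on streamcorpus.Chunk acting as a context manager that yields one StreamItem per record; in isolation (the output path is hypothetical):

from streamcorpus import Chunk

with Chunk(path='output.sc', mode='rb') as chunk:  # hypothetical output path
    urls = [si.abs_url for si in chunk]
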
Example #10
def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)
Example #11
def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)
Example #12
def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)
Example #13
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example #14
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append',
                        help='file paths to input instead of reading '
                        'from stdin')
    parser.add_argument('--in-glob', action='append', default=[],
                        help='path glob specifying input files')
    parser.add_argument('--third-dir-path',
                        help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path',
                        help='path to temporary directory for scratch '
                        'files, can be large')

    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    for i_str in input_paths:
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

    ## explicitly call cleanup, which is idempotent
    pipeline.cleanup()
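
Hypothetical invocations matching the argument handling above: --input and --in-glob accumulate paths, and "-i -" switches to reading one input path per line from stdin, which cannot be combined with globs or additional inputs:

streamcorpus_pipeline --config config.yaml -i file1.sc -i file2.sc
streamcorpus_pipeline --config config.yaml --in-glob 'chunks/*.sc'
cat paths.txt | streamcorpus_pipeline --config config.yaml -i -
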
Example #15
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example #16
def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example #17
def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        ## explicitly call cleanup, which is idempotent and might not
        ## get called by atexit if we are running under
        ## multiprocessing
        pipeline.cleanup()
Example #18
default_config = {
    'output_chunk_max_count': 500,
    'rate_log_interval': 100,
    'incremental_transforms': [],
    'batch_transforms': [],
    'post_batch_incremental_transforms': [],
    'cleanup_tmp_files': True,
    'assert_single_source': True,
    'reader': 'from_local_chunks',
    'writers': ['to_local_chunks'],
}
runtime_keys = {
    'tmp_dir_path': 'tmp_dir_path',
    'third_dir_path': 'third_dir_path',
}
sub_modules = set(stage for stage in PipelineStages().itervalues()
                  if hasattr(stage, 'config_name'))


def replace_config(config, name):
    # Do we have external stages?
    if ('external_stages_path' not in config
            and 'external_stages_modules' not in config):
        return streamcorpus_pipeline
    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            stages.load_external_stages(path)  # use the normalized path from above