# Example #1
def instantiate_config(config):
    """Set up the config and load external modules.

    Mutates ``config`` in place: makes configured paths absolute, then
    stamps the ``streamcorpus_pipeline`` sub-config with a hash and a
    JSON serialization of the full config.  Resets the log level,
    extends ``sys.path`` from the ``pythonpath`` setting, and imports
    (and, when present, calls ``setup()`` on) each module named in
    ``setup_modules``.  Exits the process with status 1 if any module
    fails to load or initialize.

    :param config: full configuration dict; must contain a
        ``streamcorpus_pipeline`` key.
    """
    make_absolute_paths(config)

    pipeline_config = config["streamcorpus_pipeline"]

    ## record a fingerprint and a serialized copy of the running config
    pipeline_config["config_hash"] = make_hash(config)
    pipeline_config["config_json"] = json.dumps(config)

    ## setup loggers
    reset_log_level(pipeline_config.get("log_level", "DEBUG"))

    # logger.warn is a deprecated alias; logger.warning is the supported name
    logger.warning("running config: %s = %s" % (pipeline_config["config_hash"], config))

    logger.info(json.dumps(config, indent=4, sort_keys=True))

    ## Load modules
    # This is a method of using settings in yaml configs to load plugins.
    die = False
    # .values() iterates identically on Python 2 and 3 (itervalues was 2-only)
    for pathstr in pipeline_config.get("pythonpath", {}).values():
        if pathstr not in sys.path:
            sys.path.append(pathstr)
    for modname in pipeline_config.get("setup_modules", {}).values():
        try:
            m = importlib.import_module(modname)
            if not m:
                logger.error("could not load module %r", modname)
                die = True
                continue
            if hasattr(m, "setup"):
                m.setup()
                logger.debug("loaded and setup %r", modname)
            else:
                logger.debug("loaded %r", modname)
        except Exception:
            # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; log with traceback and defer the exit so all
            # failing modules get reported before we bail out
            logger.error("error loading and initting module %r", modname, exc_info=True)
            die = True
    if die:
        sys.exit(1)
def test_kvlayer_extractor_and_loader(config):
    """Round-trip a test chunk through the kvlayer loader and extractor.

    Loads a known v0.3.0 chunk file into kvlayer, verifies the
    doc_id/epoch_ticks index table was populated, then runs the
    extractor with several ``i_str`` range inputs and asserts each one
    yields exactly the set of stream_ids that were loaded.

    :param config: kvlayer-enabled configuration dict passed through to
        ``to_kvlayer`` / ``from_kvlayer``.
    """
    path = get_test_v0_3_0_chunk_path()
    loader = to_kvlayer(config)

    ## name_info and i_str are not used by the loader
    i_str = ''
    name_info = {}
    loader(path, name_info, i_str)

    ## check that index table was created
    all_doc_ids = set()
    all_epoch_ticks = set()
    for (doc_id, epoch_ticks), empty_data in loader.client.scan('stream_items_doc_id_epoch_ticks'):
        all_doc_ids.add(doc_id)
        all_epoch_ticks.add(epoch_ticks)
    all_doc_ids = sorted(all_doc_ids)
    all_epoch_ticks = sorted(all_epoch_ticks)
    logger.info('%d doc_ids', len(all_doc_ids))

    ## expected ids are loop-invariant: read the chunk file once here
    ## instead of re-reading it on every i_str iteration below
    _input_chunk_ids = [si.stream_id for si in streamcorpus.Chunk(path)]
    input_chunk_ids = sorted(set(_input_chunk_ids))
    logger.info('%d inserts, %d unique',
                len(_input_chunk_ids), len(input_chunk_ids))

    ## make an extractor
    extractor = from_kvlayer(config)

    ## test it with different i_str inputs: empty (full scan), an
    ## open-ended epoch range, and an explicit (ticks, doc_id) range
    for i_str in ['', '0,,%d,' % 10**10, '%d,%s,%d,%s' % (all_epoch_ticks[0],  all_doc_ids[0],
                                                          all_epoch_ticks[-1], all_doc_ids[-1]) ]:
        stream_ids = sorted(si.stream_id for si in extractor(i_str))
        assert len(input_chunk_ids) == len(stream_ids)
        assert input_chunk_ids == stream_ids