def instantiate_config(config):
    """Set up the config and load external modules.

    Resolves relative paths in *config*, stamps the pipeline sub-config with
    a hash and a JSON serialization of the full config, configures logging,
    extends ``sys.path`` from the ``pythonpath`` setting, and imports (and
    optionally ``setup()``s) every module named in ``setup_modules``.

    Exits the process with status 1 if any configured module fails to load
    or initialize.

    :param config: full configuration dict; must contain a
        ``streamcorpus_pipeline`` sub-dict.
    """
    make_absolute_paths(config)

    pipeline_config = config["streamcorpus_pipeline"]
    pipeline_config["config_hash"] = make_hash(config)
    pipeline_config["config_json"] = json.dumps(config)

    ## setup loggers
    reset_log_level(pipeline_config.get("log_level", "DEBUG"))
    # logger.warning is the supported name; logger.warn is a deprecated alias
    logger.warning("running config: %s = %s" % (pipeline_config["config_hash"], config))
    logger.info(json.dumps(config, indent=4, sort_keys=True))

    ## Load modules
    # This is a method of using settings in yaml configs to load plugins.
    die = False
    # .values() iterates the same entries on both Python 2 and 3
    # (the original used the Python-2-only .itervalues()).
    for pathstr in pipeline_config.get("pythonpath", {}).values():
        if pathstr not in sys.path:
            sys.path.append(pathstr)
    for modname in pipeline_config.get("setup_modules", {}).values():
        try:
            m = importlib.import_module(modname)
            if not m:
                logger.error("could not load module %r", modname)
                die = True
                continue
            if hasattr(m, "setup"):
                m.setup()
                logger.debug("loaded and setup %r", modname)
            else:
                logger.debug("loaded %r", modname)
        except Exception:
            # A bare "except:" here would also swallow SystemExit and
            # KeyboardInterrupt; catch Exception so those still propagate.
            logger.error("error loading and initting module %r", modname,
                         exc_info=True)
            die = True
    if die:
        sys.exit(1)
def test_kvlayer_extractor_and_loader(config):
    """Round-trip a test chunk through to_kvlayer and back via from_kvlayer.

    Loads the v0.3.0 test chunk into kvlayer, verifies that the
    doc_id/epoch_ticks index table was populated, then checks that several
    i_str query forms each return exactly the set of stream_ids that were
    inserted.
    """
    path = get_test_v0_3_0_chunk_path()
    loader = to_kvlayer(config)

    ## name_info and i_str are not used by the loader
    loader(path, {}, '')

    ## check that index table was created
    all_doc_ids = set()
    all_epoch_ticks = set()
    for (doc_id, epoch_ticks), _ in loader.client.scan('stream_items_doc_id_epoch_ticks'):
        all_doc_ids.add(doc_id)
        all_epoch_ticks.add(epoch_ticks)
    all_doc_ids = sorted(all_doc_ids)
    all_epoch_ticks = sorted(all_epoch_ticks)
    logger.info('%d doc_ids', len(all_doc_ids))

    ## make an extractor
    extractor = from_kvlayer(config)

    ## test it with different i_str inputs:
    queries = [
        '',
        '0,,%d,' % 10**10,
        '%d,%s,%d,%s' % (all_epoch_ticks[0], all_doc_ids[0],
                         all_epoch_ticks[-1], all_doc_ids[-1]),
    ]
    for i_str in queries:
        stream_ids = sorted(si.stream_id for si in extractor(i_str))
        _input_chunk_ids = [si.stream_id for si in streamcorpus.Chunk(path)]
        input_chunk_ids = sorted(set(_input_chunk_ids))
        logger.info('%d inserts, %d unique',
                    len(_input_chunk_ids), len(input_chunk_ids))
        assert len(input_chunk_ids) == len(stream_ids)
        assert input_chunk_ids == stream_ids