def __init__(self, injector=None, config_path=None):
    """
    Load the configuration, build the pipeline and set up the entrypoint.

    :param injector: Object from which to fetch dependencies. When omitted,
      a ``gransk.core.injector.Injector`` is created.
    :param config_path: Path to a YAML configuration file. When omitted,
      ``config.yml`` in the directory above this module is used.
    :type injector: ``gransk.core.injector.Injector``
    :type config_path: ``str``
    """
    if not injector:
        # Imported lazily so the default injector is only loaded when needed.
        import gransk.core.injector as _injector
        injector = _injector.Injector()

    code_root = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))

    if not config_path:
        config_path = os.path.join(code_root, 'config.yml')

    with open(config_path) as inp:
        # safe_load: plain yaml.load() without a Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older versions.
        # An empty file parses to None, so fall back to an empty mapping.
        self.config = yaml.safe_load(inp) or {}

    self.config['worker_id'] = 0
    self.config['injector'] = injector
    self.config['injector'].set_config(self.config)

    self.config[helper.CODE_ROOT] = code_root
    self.config[helper.DATA_ROOT] = os.path.join(code_root, 'local_data')
    self.config[helper.WORKERS] = 1
    self.config[helper.TAG] = 'default'

    # Keep a value supplied by the config file and default to 0 otherwise.
    # The previous getattr() on the dict looked up an attribute rather than
    # a key, so the configured value was discarded.
    self.config.setdefault(helper.MAX_FILE_SIZE, 0)

    # Type detection is always part of the pipeline, even when the config
    # file lists no subscribers at all.
    self.config.setdefault(helper.SUBSCRIBERS, []).extend([
        'gransk.core.detect_type'
    ])

    self.pipeline = pipeline.build_pipeline(self.config)
    self.entrypoint = Subscriber(self.pipeline)
    self.entrypoint.setup(self.config)
def test_build_pipeline(self):
    """A config naming one subscriber yields a pipeline with one subscriber."""
    config = {'subscribers': ['gransk.core.detect_type']}
    pipe = pipeline.build_pipeline(config)
    try:
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(len(pipe.subscribers), 1)
    finally:
        # Stop the pipeline even when the assertion fails.
        pipe.stop()
def init(self, config, queue, worker_id, injector):
    """
    Run a worker: consume paths from the queue until it stays empty, then
    stop the pipeline and write a per-worker timing report.

    :param config: Configuration object.
    :param queue: Multiprocessing Queue object holding paths to process.
    :param worker_id: Value identifying this worker.
    :param injector: Object from which to fetch dependencies.
    :type config: ``dict``
    :type queue: ``multiprocessing.Queue``
    :type worker_id: ``int``
    :type injector: ``gransk.core.injector.Injector``
    """
    log = logging.getLogger('worker')

    config[helper.WORKER_ID] = worker_id
    config[helper.INJECTOR] = injector

    pipe = pipeline.build_pipeline(config)
    entrypoint = gransk.api.Subscriber(pipe)
    entrypoint.setup(config)

    while True:
        # A one second wait without any new path ends the worker normally.
        try:
            path = queue.get(timeout=1)
        except Empty:
            log.info('[normal stop] worker %d', worker_id)
            break

        try:
            root = document.get_document('root')
            entrypoint.consume(document.get_document(path, parent=root))
        except KeyboardInterrupt:
            log.info('[aborting] worker %d', worker_id)
            break

    pipe.stop()

    # One semicolon-separated timing report per worker, in the data root.
    report_path = os.path.join(
        config[helper.DATA_ROOT], 'time-%s.csv' % worker_id)

    with open(report_path, 'w') as out:
        header = ('consumer', 'total', 'consume_count', 'avg')
        out.write('%s;%s;%s;%s\n' % header)

        for consumer, (total, consume_count, avg) in pipe.get_time_report():
            row = (consumer, total, consume_count, avg)
            out.write('%s;%.2f;%.2f;%.2f\n' % row)