Exemplo n.º 1
0
  def __init__(self, injector=None, config_path=None):
    """
    Load the configuration, then build and set up a single-worker pipeline.

    :param injector: Object from which to fetch dependencies. When omitted, a
      ``gransk.core.injector.Injector`` instance is created.
    :param config_path: Path to a YAML configuration file. When omitted,
      ``config.yml`` in the directory above this module's directory is used.
    """
    if not injector:
      # Imported lazily so the default injector is only loaded when needed.
      import gransk.core.injector as _injector
      injector = _injector.Injector()

    # Parent of the directory that contains this module.
    code_root = os.path.realpath(
          os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))

    if not config_path:
      config_path = os.path.join(code_root, 'config.yml')

    with open(config_path) as inp:
      # NOTE(review): was ``yaml.load(inp.read())``. Without an explicit
      # Loader that call can construct arbitrary Python objects and is
      # rejected by PyYAML >= 6. ``safe_load`` only accepts standard YAML
      # tags -- confirm the configuration does not rely on Python tags.
      self.config = yaml.safe_load(inp)

    self.config['worker_id'] = 0
    self.config['injector'] = injector
    self.config['injector'].set_config(self.config)

    self.config[helper.CODE_ROOT] = code_root
    self.config[helper.DATA_ROOT] = os.path.join(code_root, 'local_data')
    self.config[helper.WORKERS] = 1
    self.config[helper.TAG] = 'default'
    # The configuration is a mapping, so the key must be looked up with
    # ``get``; ``getattr`` would ignore a configured value and always
    # fall back to 0.
    self.config[helper.MAX_FILE_SIZE] = self.config.get(
        helper.MAX_FILE_SIZE, 0)
    self.config[helper.SUBSCRIBERS].extend([
      'gransk.core.detect_type'
    ])

    self.pipeline = pipeline.build_pipeline(self.config)
    self.entrypoint = Subscriber(self.pipeline)
    self.entrypoint.setup(self.config)
Exemplo n.º 2
0
    def test_build_pipeline(self):
        """Building a pipeline from one subscriber yields one subscriber."""
        config = {'subscribers': ['gransk.core.detect_type']}

        pipe = pipeline.build_pipeline(config)

        try:
            # ``assertEquals`` is a deprecated alias that was removed in
            # Python 3.12; ``assertEqual`` is the supported spelling.
            self.assertEqual(len(pipe.subscribers), 1)
        finally:
            # Stop the pipeline even when the assertion fails.
            pipe.stop()
Exemplo n.º 3
0
    def init(self, config, queue, worker_id, injector):
        """
        Set up a worker, consume paths from the queue until it stays empty,
        then write a per-consumer timing report.

        :param config: Configuration object.
        :param queue: Multiprocessing Queue object.
        :param worker_id: Value identifying this worker.
        :param injector: Object from which to fetch dependencies.
        :type config: ``dict``
        :type queue: ``multiprocessing.Queue``
        :type worker_id: ``int``
        :type injector: ``gransk.core.injector.Injector``
        """
        config[helper.WORKER_ID] = worker_id
        config[helper.INJECTOR] = injector

        log = logging.getLogger('worker')

        worker_pipeline = pipeline.build_pipeline(config)
        subscriber = gransk.api.Subscriber(worker_pipeline)
        subscriber.setup(config)

        while True:
            # No path within one second is treated as "queue drained".
            try:
                next_path = queue.get(timeout=1)
            except Empty:
                log.info('[normal stop] worker %d', worker_id)
                break

            try:
                root_doc = document.get_document('root')
                subscriber.consume(
                    document.get_document(next_path, parent=root_doc))
            except KeyboardInterrupt:
                log.info('[aborting] worker %d', worker_id)
                break

        worker_pipeline.stop()

        # One semicolon-separated report file per worker in the data root.
        report_name = 'time-%s.csv' % worker_id
        report_path = os.path.join(config[helper.DATA_ROOT], report_name)

        with open(report_path, 'w') as report:
            header = ('consumer', 'total', 'consume_count', 'avg')
            report.write('%s;%s;%s;%s\n' % header)

            for consumer, timings in worker_pipeline.get_time_report():
                total, consume_count, avg = timings
                row = (consumer, total, consume_count, avg)
                report.write('%s;%.2f;%.2f;%.2f\n' % row)