示例#1
0
def main():
    """Parse command-line options and launch the crawl-frontier broker server."""
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument('--config', type=str,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--hostname', type=str,
                        help='Hostname or IP address to bind. Default is 127.0.0.1')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    parser.add_argument('--port', type=int,
                        help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    # Command-line values win; otherwise fall back to the settings module.
    hostname = args.hostname or settings.get("ZMQ_HOSTNAME")
    port = args.port or settings.get("ZMQ_BASE_PORT")
    server = Server(hostname, port)
    server.logger.setLevel(args.log_level)
    server.start()
    def __init__(self):
        """Wire all message-bus endpoints used by the three worker roles.

        Creates consumers/producers for the strategy worker (sw), the db
        worker (db) and the spider (sp) over a single-partition feed with
        hostname partitioning enabled. The short sleeps give each endpoint
        time to connect before the next one is created.
        """
        settings = Settings()
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.mb = MessageBus(settings)
        spider_log = self.mb.spider_log()

        # strategy worker: reads the spider log, writes scoring updates
        self.sw_sl_c = spider_log.consumer(partition_id=0, type='sw')
        scoring_log = self.mb.scoring_log()
        self.sw_us_p = scoring_log.producer()

        sleep(0.1)

        # db worker: reads spider log and scoring log, writes the spider feed
        self.db_sl_c = spider_log.consumer(partition_id=None, type='db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.mb.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider: writes the spider log, reads the spider feed
        self.sp_sl_p = spider_log.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
 def __init__(self, manager):
     """Build the messaging backend for one spider process.

     Loads the message-bus class named in the settings, sets up the
     request/response codec pair, a spider-log producer, and a spider-feed
     consumer bound to this spider's partition.
     """
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)
     # Whether response bodies are serialized along with the metadata.
     keep_body = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=keep_body)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
def main():
    """Entry point: parse CLI options and start the broker server."""
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument(
        '--config', type=str,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--hostname', type=str,
        help='Hostname or IP address to bind. Default is 127.0.0.1')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    parser.add_argument(
        '--port', type=int,
        help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    # CLI flags take precedence over values from the settings module.
    hostname = args.hostname or settings.get("ZMQ_HOSTNAME")
    port = args.port or settings.get("ZMQ_BASE_PORT")
    server = Server(hostname, port)
    server.logger.setLevel(args.log_level)
    server.start()
    def __init__(self):
        """Connect every message-bus endpoint the test harness needs.

        Uses one feed partition with hostname partitioning; brief sleeps
        let each endpoint finish connecting before the next is opened.
        """
        settings = Settings()
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.mb = MessageBus(settings)
        log_stream = self.mb.spider_log()

        # strategy worker (sw) side
        self.sw_sl_c = log_stream.consumer(partition_id=0, type='sw')
        score_stream = self.mb.scoring_log()
        self.sw_us_p = score_stream.producer()

        sleep(0.1)

        # db worker side
        self.db_sl_c = log_stream.consumer(partition_id=None, type='db')
        self.db_us_c = score_stream.consumer()

        feed_stream = self.mb.spider_feed()
        self.db_sf_p = feed_stream.producer()

        sleep(0.1)

        # spider side
        self.sp_sl_p = log_stream.producer()
        self.sp_sf_c = feed_stream.consumer(0)

        sleep(0.1)
示例#6
0
 def __init__(self, manager):
     """Initialize the spider-side messaging layer.

     Resolves the message-bus implementation from settings, prepares the
     encoder/decoder, the spider-log producer and the partition-bound
     spider-feed consumer wrapped in an overuse buffer.
     """
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)
     messagebus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus_cls(settings)
     send_body = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=send_body)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
示例#7
0
                        help='Disables periodical generation of new batches.')
    # NOTE(review): this chunk begins mid-call; the opening
    # parser.add_argument('--no-batches', ...) line is outside this view.
    parser.add_argument('--no-incoming',
                        action='store_true',
                        help='Disables periodical incoming topic consumption.')
    parser.add_argument(
        '--config',
        type=str,
        required=True,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
    parser.add_argument('--port',
                        type=int,
                        help="Json Rpc service port to listen.")
    args = parser.parse_args()
    # Apply requested verbosity and send log output to the console handler.
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)

    settings = Settings(module=args.config)
    # JSONRPC_PORT is stored as a single-element list — presumably the
    # service accepts multiple ports; verify against WorkerJsonRpcService.
    if args.port:
        settings.set("JSONRPC_PORT", [args.port])

    worker = FrontierWorker(settings, args.no_batches, args.no_incoming)
    server = WorkerJsonRpcService(worker, settings)
    server.start_listening()
    worker.run()
def test_override():
    """The override settings module must set SPIDER_FEED_PARTITIONS to 2."""
    partitions = Settings().get("SPIDER_FEED_PARTITIONS")
    assert partitions == 2
def test_frontera():
    """Default settings must define both TEST_MODE and MAX_REQUESTS."""
    settings = Settings()
    for key in ("TEST_MODE", "MAX_REQUESTS"):
        assert settings.get(key) is not None
示例#10
0
        # Tail of a batch-generation method (its start is outside this view):
        # bump the per-run batch counter and record a readable timestamp.
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        # `count` is computed earlier in the (unseen) method body.
        return count


if __name__ == '__main__':
    # Command-line front end for the Frontera DB worker.
    parser = ArgumentParser(description="Frontera DB worker.")
    parser.add_argument(
        '--no-batches', action='store_true',
        help='Disables periodical generation of new batches.')
    parser.add_argument(
        '--no-incoming', action='store_true',
        help='Disables periodical incoming topic consumption.')
    parser.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
    parser.add_argument(
        '--port', type=int,
        help="Json Rpc service port to listen.")
    args = parser.parse_args()
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)

    settings = Settings(module=args.config)
    if args.port:
        settings.set("JSONRPC_PORT", [args.port])

    # Run the worker alongside its JSON-RPC control service.
    worker = FrontierWorker(settings, args.no_batches, args.no_incoming)
    server = WorkerJsonRpcService(worker, settings)
    server.start_listening()
    worker.run()

示例#11
0
            # Emit one encoded update-score message for this request; the
            # enclosing method (its start is outside this view) returns a
            # list of encoded messages, or an empty list on the fall-through.
            encoded = self._encoder.encode_update_score(
                request.meta['fingerprint'], score, request.url, False)
            return [encoded]
        return []


if __name__ == '__main__':
    # CLI entry point for the strategy (scoring) worker.
    parser = ArgumentParser(description="Crawl frontier scoring worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str, required=True,
                        help='Crawling strategy module name')

    args = parser.parse_args()
    logger.setLevel(args.log_level)
    settings = Settings(module=args.config)
    # The crawling strategy is resolved dynamically from its module path.
    strategy_module = import_module(args.strategy)
    worker = ScoringWorker(settings, strategy_module)
    worker.run()
示例#12
0
def test_override():
    """Overriding settings should report two spider-feed partitions."""
    settings = Settings()
    assert settings.get("SPIDER_FEED_PARTITIONS") == 2
示例#13
0
def test_instance_attrs():
    """Attributes passed to the constructor are exposed via `attributes`."""
    settings = Settings(attributes={"XYZ": "hey"})
    assert settings.attributes["XYZ"] == "hey"
示例#14
0
def test_frontera():
    """Built-in defaults must provide TEST_MODE and MAX_REQUESTS."""
    defaults = Settings()
    assert defaults.get("TEST_MODE") is not None
    assert defaults.get("MAX_REQUESTS") is not None