class WSClientTransport(WebSocketClient):
    """Client-side websocket transport owning a single app instance."""

    # Set by the application layer to a callable that creates the app object.
    APP_FACTORY = None

    def __init__(self, url):
        self._close_event = Event()
        WebSocketClient.__init__(self, url)
        self._app = None
        self._lock = RLock()
        # Patch socket.sendall to protect it with a lock, in order to
        # prevent sending data from multiple greenlets concurrently.
        _sendall = self.sock.sendall

        def sendall(data):
            # `with` is equivalent to acquire/try/finally-release; the former
            # bare `except: raise` clause was a no-op and has been dropped.
            with self._lock:
                _sendall(data)

        self.sock.sendall = sendall

    def connect(self):
        super(WSClientTransport, self).connect()
        self._app = self.APP_FACTORY(self)
        log.info("Connected to websocket server {0}".format(self.url))

    def closed(self, code, reason=None):
        # Atomically detach the app so on_close runs at most once.
        app, self._app = self._app, None
        if app:
            app.on_close()
        self._close_event.set()

    def ponged(self, pong):
        pass

    def received_message(self, message):
        log.debug("Received message {0}".format(message))
        if self._app:
            self._app.on_received_packet(STRING(message))
        else:
            log.warning('Websocket client app already closed')

    def send_packet(self, data):
        log.debug("Sending message {0}".format(data))
        self.send(data)

    def force_shutdown(self):
        # called by the upper layer, and no callback will be possible when closed
        self._app = None
        self.close()
        self._close_event.set()
        log.info('Websocket client closed')

    def wait_close(self):
        # Blocks until closed() or force_shutdown() has run.
        self._close_event.wait()

    def app(self):
        return self._app
class WSServerTransport(WebSocket):
    """Server-side websocket transport owning a single app instance."""

    # Set by the application layer to a callable creating the app object.
    APP_FACTORY = None

    def __init__(self, *args, **kwargs):
        super(WSServerTransport, self).__init__(*args, **kwargs)
        self._app = None

    def opened(self):
        # Patch socket.sendall to protect it with a lock, in order to
        # prevent sending data from multiple greenlets concurrently.
        self._lock = RLock()
        _sendall = self.sock.sendall

        def sendall(data):
            # `with` replaces the former acquire/try/`except: raise`/finally
            # sequence - the bare re-raise clause was a no-op.
            with self._lock:
                _sendall(data)

        self.sock.sendall = sendall
        # Create the app from the flattened query-string parameters.
        if not self.environ.get('QUERY_STRING'):
            query = {}
        else:
            query = urlparse.parse_qs(self.environ['QUERY_STRING'])
            for key, value in query.iteritems():
                query[key] = value[0]
        self._app = self.APP_FACTORY(self, query)

    def closed(self, code, reason=None):
        # Atomically detach the app so on_close runs at most once.
        app, self._app = self._app, None
        if app:
            app.on_close()

    def ponged(self, pong):
        pass

    def received_message(self, message):
        log.debug("Received message {0}".format(message))
        # Guard against messages arriving after closed() detached the app;
        # the previous version raised AttributeError on None here.
        if self._app:
            self._app.on_received_packet(STRING(message))
        else:
            log.warning('Websocket server app already closed')

    def send_packet(self, data):
        log.debug("Sending message {0}".format(data))
        self.send(data)

    def force_shutdown(self):
        # called by the upper layer, and no callback will be possible when closed
        log.info("shutdown")
        self._app = None
        self.close()
class ThreadSafeFSM(InstrumentFSM):
    """An InstrumentFSM whose event dispatch is serialized by a re-entrant lock."""

    def __init__(self, states, events, enter_event, exit_event):
        # The lock must exist before the base class runs any initialization.
        self._lock = RLock()
        super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)

    def on_event(self, event, *args, **kwargs):
        # Blocking variant - waits for the lock, then dispatches.
        with self._lock:
            return super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)

    def on_event_if_free(self, event, *args, **kwargs):
        # Non-blocking variant - fails fast when another event is in flight.
        acquired = self._lock.acquire(blocking=False)
        if not acquired:
            raise FSMLockedError
        try:
            return super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
        finally:
            self._lock.release()
def opened(self):
    # Patch socket.sendall to protect it with a lock, in order to prevent
    # sending data from multiple greenlets concurrently.
    self._lock = RLock()
    _sendall = self.sock.sendall

    def sendall(data):
        # `with` is equivalent to acquire/try/finally-release; the former
        # bare `except: raise` clause was a no-op and has been dropped.
        with self._lock:
            _sendall(data)

    self.sock.sendall = sendall
    # Create the app from the flattened query-string parameters.
    try:
        if not self.environ.get('QUERY_STRING'):
            query = {}
        else:
            query = urlparse.parse_qs(self.environ['QUERY_STRING'])
            for key, value in query.iteritems():
                query[key] = value[0]
        self._app = self.APP_FACTORY(self, query)
    except Exception:
        log.exception('Failed to create websocket app')
        raise
def __init__(self, config):
    """Store the config, a password-masked copy of it, and the client queue."""
    super(SudsSOAPWrapper, self).__init__(config)
    self.update_lock = RLock()
    self.config = config
    # A copy safe for logging - the password is masked out.
    masked = deepcopy(config)
    masked['password'] = '******'
    self.config_no_sensitive = masked
    self.client = _SudsClientQueue(Queue(config['pool_size']), config['name'])
def _check_channel_lock(self, partner: typing.Address):
    """Grab the per-partner operation lock, failing fast when it is taken."""
    locks = self.channel_operations_lock
    if partner not in locks:
        locks[partner] = RLock()
    acquired = locks[partner].acquire(blocking=False)
    if not acquired:
        raise ChannelBusyError(
            f'Channel between {self.node_address} and {partner} is '
            f'busy with another ongoing operation.',
        )
def __init__(self, amqp_url='amqp:///', heartbeat=30, debug=False):
    """Prepare connection state; no network activity happens here."""
    super(BaseConnection, self).__init__()
    self.debug = debug
    self.channel_id = 0
    self.channels = {}
    self.connect_lock = RLock()
    self.channels_lock = RLock()
    self.queue = None
    self.state = STATE_DISCONNECTED
    self.disconnect_event = Event()
    # Heartbeat interval to negotiate with the broker.
    self.requested_heartbeat = heartbeat
    # Split the AMQP URL into its individual connection parameters.
    (self.username, self.password, self.vhost, self.host, self.port) = \
        parse_amqp_url(str(amqp_url))
def __init__(self):
    """Set up the in-RAM message maps and kick off the expiry janitor."""
    self.lock = RLock()
    # Topic ID -> Sub key -> Msg ID
    self.topic_sub_key_to_msg_id = {}
    # Topic ID -> Msg ID -> Message data
    self.topic_msg_id_to_msg = {}
    # Msg ID -> (Topic ID, sub_keys, expiration time in milliseconds)
    self.msg_id_to_expiration = {}
    # Start in background a cleanup task that removes all expired messages
    spawn_greenlet(self.run_cleanup_task)
def __init__(self, config):
    # Connection definition - accessed both as attributes (pool_size etc.)
    # and as a mapping (**self.config), so presumably a Bunch-like object.
    self.config = config
    # Password is masked out, so this URL is safe to log.
    self.url = '{protocol}://{user}:******@{host}:{port}/{database}'.format(**self.config)
    self.client = ConnectionQueue(
        self.config.pool_size, self.config.queue_build_cap, self.config.name, 'Odoo',
        self.url, self.add_client)
    self.update_lock = RLock()
    self.logger = getLogger(self.__class__.__name__)
def __init__(self, services=None, service_store_config=None, odb=None):
    """Keep references to the service collection, its config and the ODB."""
    self.services = services
    self.service_store_config = service_store_config
    self.odb = odb
    self.update_lock = RLock()
    # Lookup maps between service IDs, names and implementation names.
    self.id_to_impl_name = {}
    self.impl_name_to_id = {}
    self.name_to_impl_name = {}
def __init__(self, proxy, runner):
    """Bind the task to its proxy/runner pair and assign it a unique ID."""
    self.proxy = proxy
    self.runner = runner
    # Chain links - tasks form a doubly-linked list.
    self.next = None
    self.prev = None
    self.task_id = JobService.instance().generateUniqueID()
    self.profile = Task.Profile()
    self.locker = RLock()
def __init__(self, canvas, addr):
    """Track a single client connection on the given canvas."""
    self.canvas = canvas
    self.addr = addr
    self.socket = None
    # Timestamp of when the client connected.
    self.connect_ts = time.time()
    # And this is used to limit clients to X messages per tick.
    # We start at 0 (instead of x) to add a reconnect-penalty.
    self.lock = RLock()
def publish(self, topic, data):
    # Lazily create the serialization lock on first publish.
    # NOTE(review): the check-then-create below is not atomic - two callers
    # racing before self.lock exists could each create their own lock.
    # Presumably safe under gevent's cooperative scheduling (no yield point
    # between the check and the store) - confirm before using real threads.
    lock = self.lock
    if not lock:
        lock = RLock()
        self.lock = lock
    # Serialize concurrent publishes to the underlying Redis connection.
    with lock:
        return RedisInterconnect.publish(self, topic, data)
def __init__(self, manager):
    """An indexed collection of requests owned by the given manager."""
    super(RequestCollection, self).__init__()
    self._manager = manager
    self._lock = RLock()
    # Indexes over the live requests.
    self._by_request_id = {}
    self._by_jid = {}
def get_lock(self, lock_name):
    # Return the named RLock stored under lock_name, creating it on first use.
    from gevent.lock import RLock
    lock_name = _normalize_path(lock_name)
    # NOTE(review): the check-then-write below is not atomic - two concurrent
    # callers could each create and store their own lock for the same name.
    # Confirm callers cannot race here (or that the store deduplicates).
    if lock_name not in self._store:
        lock = RLock()
        self.write(lock_name, lock)
    else:
        lock = self._store[lock_name]
    return lock
def __init__(self, sync_objects, eventer):
    """Request collection wired to sync-object state and an event sink."""
    super(RequestCollection, self).__init__()
    self._sync_objects = sync_objects
    self._eventer = eventer
    self._lock = RLock()
    # Indexes over the live requests.
    self._by_request_id = {}
    self._by_jid = {}
def add_sub_key_no_lock(self, sub_key): """ Adds metadata about a given sub_key - must be called with self.lock held. """ # Already seen it - can be ignored if sub_key in self.sub_keys: return self.sub_keys.add(sub_key) self.batch_size[sub_key] = 1 # # A dictionary that maps when GD messages were last time fetched from the SQL database for each sub_key. # Since fetching means we are issuing a single query for multiple sub_keys at a time, we need to fetch only these # messages that are younger than the oldest value for all of the sub_keys whose messages will be fetched. # # Let's say we have three sub_keys: a, b, c # # time 0001: pub to a, b, c # time 0001: store last_gd_run = 0001 for each of a, b, c # time 0002: pub to a, b # time 0002: store last_gd_run = 0002 for a, b # time 0003: pub to b, c # time 0003: store last_gd_run = 0003 for b, c # time 0004: pub to c # time 0004: store last_gd_run = 0004 for c # # We now have: {a:0002, b:0003, c:0004} # # Let's say we now receive: # # time 0005: pub to a, b, c # # Because we want to have a single SQL query for all of a, b, c instead of querying the database for each of sub_key, # we need to look up values stored in this dictionary for each of the sub_key and use the smallest one - in this case # it would be 0002 for sub_key a. Granted, we know that there won't be any keys for b in the timespan of 0002-0003 # or for c in the duration of 0003-004, so in the case of these other keys reaching but so back in time is a bit too much # but this is all fine anyway because the most important part is that we can still use a single SQL query. # # Similarly, had it been a pub to b, c in time 0005 then we would be using min of b and c which is 0003. 
# # The reason why this is fine is that when we query the database not only do we use this last_gd_run but we also give it # a delivery status to return messages by (initialized only) and on top of it, we provide it a list of message IDs # that are currently being delivered by tasks, so in other words, we never receive duplicates from the databases # that have been already delivered or are about to be. # self.last_gd_run = {} delivery_list = SortedList() delivery_lock = RLock() self.delivery_lists[sub_key] = delivery_list self.delivery_tasks[sub_key] = DeliveryTask( self.pubsub, sub_key, delivery_lock, delivery_list, self.deliver_pubsub_msg, self.confirm_pubsub_msg_delivered, self.pubsub.get_subscription_by_sub_key(sub_key).config) self.sub_key_locks[sub_key] = delivery_lock
def __init__(self, params):
    """Count the total number of tests across all stages of all param items."""
    self.total = 0
    self.processed = 0
    from gevent.lock import RLock
    self.lock = RLock()
    # One unit of work per test name in every stage of every item.
    for item in params:
        stages = item['test_stages']
        for stage_name in stages:
            for _test_name in stages[stage_name]:
                self.total += 1
def __init__(self, session_id, data):
    # Per-session state; attaches a Redis pub/sub handle and starts listening.
    self.data = data
    self.session_id = session_id
    # Messages delivered but not yet acknowledged by the client.
    self._unacked = []
    self._pubsub = redis_connection.pubsub()
    self._read_lock = RLock()
    # Starts consuming messages immediately - the object is live after this.
    self.start_listen()
def __init__(self, config, server=None):
    # type: (Bunch, ParallelServer)
    """Keep the connection definition and reset runtime state flags."""
    self.config = config
    self.server = server
    # Human-friendly username for log messages.
    config.username_pretty = config.username or '(None)'
    # Runtime state - no client exists until a connection is attempted.
    self._client = None
    self.is_connected = False
    self.delete_requested = False
    self.update_lock = RLock()
def __init__(self, filename, handler_cls, *handler_args, **handler_kwargs):
    """A dispatching log handler that fans out to per-key handler_cls handlers."""
    logging.Handler.__init__(self, level=handler_kwargs.get("level", logging.NOTSET))
    self.filename = filename
    # Lazily-created child handlers, guarded by handlers_lock.
    self.handlers = {}
    self.handlers_lock = RLock()
    # Accepts either a class object or a dotted path to import.
    self.handler_cls = instance_or_import(handler_cls)
    self.handler_args = handler_args
    self.handler_kwargs = handler_kwargs
def __init__(self):
    """RBAC registry plus the permission/role cross-reference maps."""
    self.registry = Registry(self._delete_callback)
    self.update_lock = RLock()
    # Permission and role lookup maps, kept in sync with the registry.
    self.permissions = {}
    self.http_permissions = {}
    self.role_id_to_name = {}
    self.role_name_to_id = {}
    self.client_def_to_role_id = {}
    self.role_id_to_client_def = {}
def __init__(self, mesos_master_path, zk_servers, **kwargs):
    # Proxy source tracking Mesos master membership through ZooKeeper.
    self._members = {}
    self._current_leader = None
    self._member_lock = RLock()
    self._zk_server_set = None
    # NOTE(review): _on_join/_on_leave appear to return continuation
    # callbacks stored for later invocation - confirm their contract.
    self._next_on_join = self._on_join(mesos_master_path)
    self._next_on_leave = self._on_leave(mesos_master_path)
    # Base class is initialized last, so all of the state above exists
    # before any callbacks can possibly fire.
    super(MesosMasterProxySource, self).__init__(mesos_master_path, zk_servers, **kwargs)
def event_consumer_loop():
    # xmlrpclib is not reentrant. We might have several greenlets accessing
    # supervisor at the same time so we serialize event treatment here
    lock = RLock()
    for event in channel:
        try:
            with lock:
                supervisor.publish_event(event)
        except Exception:
            # Previously a bare `except:`, which also swallowed SystemExit
            # and GreenletExit and could keep a killed greenlet alive; only
            # ordinary errors are logged-and-ignored now.
            logging.exception('Error processing %s', event)
def __init__(self, pd_core):
    """Wire up registries for the containers/processes managed by the PD core."""
    self._pd_core = pd_core
    self.container = self._pd_core.container
    self.rr = self.container.resource_registry
    self._lock = RLock()      # Master lock protecting data structures
    self._containers = {}     # Registry of containers
    self._processes = {}      # Registry of processes
    self.preconditions_true = gevent.event.Event()
def __init__(self, logger, pending_online_users, make_func, send_func):
    """Initialize Queue Handler

    :param logger: logger object
    :type logger: Logger
    :param pending_online_users: online users queue
    :type pending_online_users: gevent.queue
    :param make_func: the function to make bundle
    :type make_func: lambda,instancemethod,function
    :param send_func: the function to send bundle
    :type send_func: lambda,instancemethod,function
    """
    self.logger = logger
    self.pending_online_users = pending_online_users
    self._make_func = make_func
    self._send_func = send_func
    # Runtime state.
    self.alive = True
    self.last_idx = None
    self._pause_lock = RLock()
def __init__(self, pubsub):
    """In-RAM message store bound to the given pubsub object."""
    self.pubsub = pubsub  # type: PubSub
    self.lock = RLock()
    self.sub_key_to_msg_id = {}  # Sub key -> Msg ID set --- What messages are available for a given subcriber
    self.msg_id_to_sub_key = {}  # Msg ID -> Sub key set - What subscribers are interested in a given message
    self.msg_id_to_msg = {}      # Msg ID -> Message data - What is the actual contents of each message
    self.topic_msg_id = {}       # Topic ID -> Msg ID set --- What messages are available for each topic (no matter sub_key)
    # Start in background a cleanup task that deletes all expired and removed messages
    spawn_greenlet(self.run_cleanup_task)
def __init__(self, services=None, service_store_config=None, odb=None, server=None):
    """Service store bound to an ODB, a server and a pattern matcher."""
    self.services = services
    self.service_store_config = service_store_config
    self.odb = odb
    self.server = server
    self.update_lock = RLock()
    # Lookup maps between service IDs, names and implementation names.
    self.id_to_impl_name = {}
    self.impl_name_to_id = {}
    self.name_to_impl_name = {}
    self.patterns_matcher = Matcher()
def __init__(self, url):
    self._close_event = Event()
    WebSocketClient.__init__(self, url)
    self._app = None
    self._lock = RLock()
    # Patch socket.sendall to protect it with a lock, in order to prevent
    # sending data from multiple greenlets concurrently.
    _sendall = self.sock.sendall

    def sendall(data):
        # `with` replaces the former acquire/try/`except: raise`/finally
        # sequence - the bare re-raise clause was a no-op.
        with self._lock:
            _sendall(data)

    self.sock.sendall = sendall
def __init__(self, config):
    """Build the connection queue for an OpenStack Swift definition."""
    self.config = config
    self.client = ConnectionQueue(
        config.pool_size, config.queue_build_cap, config.name, 'OpenStack Swift',
        config.auth_url, self.add_client)
    self.update_lock = RLock()
    self.logger = getLogger(self.__class__.__name__)
def __init__(self, config, on_message_callback):
    # ZeroMQ ROUTER-based broker; binds no sockets here, only prepares state.
    self.config = config
    self.on_message_callback = on_message_callback
    self.address = config.address
    self.poll_interval = config.poll_interval
    self.pool_strategy = config.pool_strategy
    self.service_source = config.service_source
    self.keep_running = True
    # Last colon-separated component of the address is the TCP port.
    self.tcp_port = int(self.address.split(':')[-1])

    # A hundred years in seconds, used when creating internal workers
    self.y100 = 60 * 60 * 24 * 365 * 100

    # So they do not have to be looked up on each request or event
    self.has_info = logger.isEnabledFor(logging.INFO)
    self.has_debug = logger.isEnabledFor(logging.DEBUG)
    self.has_pool_strategy_simple = self.pool_strategy == ZMQ.POOL_STRATEGY_NAME.SINGLE
    self.has_service_source_zato = self.service_source == ZMQ.SERVICE_SOURCE_NAME.ZATO
    self.zato_service_name = config.service_name
    self.zato_channel = CHANNEL.ZMQ

    # A simple pool is pinned to exactly one worker; otherwise the pool
    # sizing comes from configuration.
    if self.has_pool_strategy_simple:
        self.workers_pool_initial = 1
        self.workers_pool_mult = 0
        self.workers_pool_max = 1
    else:
        self.workers_pool_initial = config.workers_pool_initial
        self.workers_pool_mult = config.workers_pool_mult
        self.workers_pool_max = config.workers_pool_max

    # Maps service names to workers registered to handle requests to that service
    self.services = {}

    # Details about each worker, mapped by worker_id:Worker object
    self.workers = {}

    # Held upon most operations on sockets
    self.lock = RLock()

    # How often, in seconds, to send a heartbeat to workers
    self.heartbeat = config.heartbeat

    self.ctx = zmq.Context()
    self.socket = self.ctx.socket(zmq.ROUTER)
    self.socket.linger = config.linger
    self.poller = zmq.Poller()
    self.poller.register(self.socket, zmq.POLLIN)

    # Maps event IDs to methods that handle a given one
    self.handle_event_map = {
        const.v01.ready: self.on_event_ready,
        const.v01.reply_from_worker: self.on_event_reply,
        const.v01.heartbeat: self.on_event_heartbeat,
        const.v01.disconnect: self.on_event_disconnect,
    }
def __init__(self):
    """Create an empty statd context; all handles are attached later."""
    # External handles, filled in during startup.
    self.client = None
    self.server = None
    self.datastore = None
    self.config = None
    # HDF output file and the group data is written into.
    self.hdf = None
    self.hdf_group = None
    # Pending events plus the lock that guards the queue.
    self.event_queue = collections.deque()
    self.event_lock = RLock()
    self.data_sources = {}
    self.logger = logging.getLogger('statd')
def __init__(self, config, conn_type):
    """Generic cloud connection wrapper parametrized by conn_type."""
    self.conn_type = conn_type
    self.config = config
    self.client = ConnectionQueue(
        config.pool_size, config.queue_build_cap, config.name, conn_type,
        config.auth_url, self.add_client)
    self.update_lock = RLock()
    self.logger = logging.getLogger(self.__class__.__name__)
def __init__(self, logger):
    # A greenlet that manages user/message queues; it starts itself on creation.
    Greenlet.__init__(self)
    self.logger = logger
    self._users_lock = RLock()
    self._msgs = {}
    self._users = {}
    self.send_queue = Queue()
    self.pending_online_users = Queue()
    # bootstrap() presumably loads initial state - note it runs before
    # start(), so the greenlet loop sees a populated object.
    self.bootstrap()
    self._dying = False
    self.start()
def __init__(self, server):
    """Cache container holding built-in and Memcached caches for the server."""
    self.server = server
    self.lock = RLock()
    # Returned for cache types that have not been configured yet.
    self.default = _NotConfiguredAPI()
    self.caches = {
        CACHE.TYPE.BUILTIN: {},
        CACHE.TYPE.MEMCACHED: {},
    }
    # Direct aliases into self.caches.
    self.builtin = self.caches[CACHE.TYPE.BUILTIN]
    self.memcached = self.caches[CACHE.TYPE.MEMCACHED]
def __init__(self, args):
    # TODO: make separate queues for fast logging
    """Scanner state: CLI args, colour output, logging and counters."""
    self.args = args
    self.lock = RLock()
    # Colorama init
    init()
    # Initialise the logging and output sinks.
    self._init_logger()
    self._init_output()
    # Stats
    self.urls_scanned = 0
def __init__(self):
    """Crawler bootstrap: network manager, greenlet pool and status counters."""
    # Applies the class-level debug level to all HTTP connections.
    httplib.HTTPConnection.debuglevel = self.http_debuglevel
    self.network = NetworkManager(crawler=self)
    self.pool = Pool()
    self.lock = RLock()
    self.bloom_filters = {}
    self.name = self.__class__.__name__
    self._status = {
        'process_count': 0,
        'is_stop': True,
        'run_seconds': 0,
        'crawler_name': self.name,
    }
def __init__(self, dispatcher):
    # Balancer state; note create_initial_queues() runs before the locks
    # exist, so it must not rely on them.
    self.dispatcher = dispatcher
    self.task_list = []
    self.task_queue = Queue()
    self.resource_graph = dispatcher.resource_graph
    self.threads = []
    self.executors = []
    self.logger = logging.getLogger('Balancer')
    # Tasks are journalled into a serial log-type collection.
    self.dispatcher.require_collection('tasks', 'serial', type='log')
    self.create_initial_queues()
    self.schedule_lock = RLock()
    self.distribution_lock = RLock()
    # Debugger hooks - disabled unless explicitly configured.
    self.debugger = None
    self.debugged_tasks = None
    self.dispatcher.register_event_type('task.changed')
def __init__(self):
    """Reset all client state; nothing connects until told to."""
    # RPC plumbing.
    self.rpc = None
    self.pending_calls = {}
    self.default_timeout = 20
    # Event plumbing.
    self.pending_events = []
    self.event_handlers = {}
    self.event_distribution_lock = RLock()
    self.event_emission_lock = RLock()
    self.event_cv = Event()
    self.event_thread = None
    self.last_event_burst = None
    self.use_bursts = False
    # Callbacks supplied by the user of this client.
    self.event_callback = None
    self.error_callback = None
    self.rpc_callback = None
    # Transport details - populated later.
    self.receive_thread = None
    self.token = None
    self.scheme = None
    self.transport = None
    self.parsed_url = None
def __init__(self, dispatcher):
    # Core collections plus executor/thread bookkeeping.
    self.dispatcher = dispatcher
    self.task_list = []
    self.task_queue = Queue()
    self.resource_graph = dispatcher.resource_graph
    self.threads = []
    self.executors = []
    self.logger = logging.getLogger("Balancer")
    # Tasks are journalled into a serial log-type collection.
    self.dispatcher.require_collection("tasks", "serial", type="log")
    self.create_initial_queues()
    self.start_executors()
    self.schedule_lock = RLock()
    self.distribution_lock = RLock()
    # Debugger hooks - disabled unless explicitly configured.
    self.debugger = None
    self.debugged_tasks = None
    self.dispatcher.register_event_type("task.changed")

    # Lets try to get `EXECUTING|WAITING|CREATED` state tasks
    # from the previous dispatcher instance and set their
    # states to 'FAILED' since they are no longer running
    # in this instance of the dispatcher
    for stale_task in dispatcher.datastore.query("tasks", ("state", "in", ["EXECUTING", "WAITING", "CREATED"])):
        self.logger.info(
            "Stale task ID: {0}, name: {1} being set to FAILED".format(stale_task["id"], stale_task["name"])
        )
        stale_task.update(
            {
                "state": "FAILED",
                "error": {
                    "type": "TaskException",
                    "message": "dispatcher process died",
                    "code": errno.EINTR,
                    "stacktrace": "",
                    "extra": None,
                },
            }
        )
        dispatcher.datastore.update("tasks", stale_task["id"], stale_task)
def __init__(self, dispatcher):
    # Core collections plus executor/thread bookkeeping.
    self.dispatcher = dispatcher
    self.task_list = []
    self.task_queue = Queue()
    self.resource_graph = dispatcher.resource_graph
    self.threads = []
    self.executors = []
    self.logger = logging.getLogger('Balancer')
    # Tasks are journalled into a serial log-type collection.
    self.dispatcher.require_collection('tasks', 'serial', type='log')
    self.create_initial_queues()
    self.start_executors()
    self.schedule_lock = RLock()
    self.distribution_lock = RLock()
    # Debugger hooks - disabled unless explicitly configured.
    self.debugger = None
    self.debugged_tasks = None
    self.dispatcher.register_event_type('task.changed')

    # Lets try to get `EXECUTING|WAITING|CREATED` state tasks
    # from the previous dispatcher instance and set their
    # states to 'FAILED' since they are no longer running
    # in this instance of the dispatcher
    for stale_task in dispatcher.datastore.query('tasks', ('state', 'in', ['EXECUTING', 'WAITING', 'CREATED'])):
        self.logger.info('Stale task ID: {0}, name: {1} being set to FAILED'.format(
            stale_task['id'], stale_task['name']
        ))
        stale_task.update({
            'state': 'FAILED',
            'error': {
                'type': 'TaskException',
                'message': 'dispatcher process died',
                'code': errno.EINTR,
                'stacktrace': '',
                'extra': None
            }
        })
        dispatcher.datastore.update('tasks', stale_task['id'], stale_task)
class Task:
    """Describes one service data-request processing run.

    Tasks are hooked together into a doubly-linked chain.
    """

    class Profile:
        """Execution profile of a task run."""

        def __init__(self):
            self.start_time = 0
            self.end_time = 0
            self.last_watch_time = 0  # time of the most recent watch check
            self.status = JobStatusType.STOPPED
            self.result = None  # task execution result

    def __init__(self, proxy, runner):
        self.proxy = proxy
        self.next = None
        self.prev = None
        self.task_id = JobService.instance().generateUniqueID()
        self.runner = runner
        self.profile = Task.Profile()
        self.locker = RLock()

    def chainNext(self, task):
        """Append *task* after this one and return self for further chaining."""
        self.next = task
        self.next.prev = self
        return self

    def getUniqueID(self):
        return self.task_id

    @property
    def ID(self):
        return self.getUniqueID()

    def execute(self, job):
        """Create the remote task; on success mark the profile as running."""
        self.locker.acquire()
        try:
            task_id = self.getUniqueID()
            result = self.proxy.createTask(task_id, job)
            if result.status == CallReturnStatusValueType.SUCC:
                self.profile.start_time = int(time.time())
                self.profile.status = JobStatusType.RUNNING
                JobService.instance().onJobTaskStarted(self)
            return result
        finally:
            self.locker.release()

    def onFinished(self, task_result):
        """Record a successful completion and propagate the result to the runner."""
        self.locker.acquire()
        try:
            self.profile.end_time = int(time.time())
            self.profile.status = JobStatusType.FINISHED
            self.profile.result = task_result
            self.runner.getProfile().result = task_result
            # self.runner.onTaskFinished(self)
        finally:
            self.locker.release()

    def onError(self, task_result=None):
        """Record a failure and notify the runner.

        The previous signature used a mutable default (``task_result={}``);
        ``None`` is the sentinel now and is mapped back to ``{}`` so the
        observable behaviour is unchanged.
        """
        if task_result is None:
            task_result = {}
        self.locker.acquire()
        try:
            self.profile.end_time = int(time.time())
            self.profile.status = JobStatusType.FAILED
            self.runner.getProfile().result = task_result
            self.runner.onTaskError(self)
        finally:
            self.locker.release()

    def onWatchTime(self):
        # Deliberately disabled - the early return keeps the polling code
        # below for reference without executing it.
        return
        try:
            result = self.proxy.watchTask(self.getUniqueID())
            self.profile.last_watch_time = int(time.time())
        except:
            traceback.print_exc()
class Balancer(object):
    """Schedules and distributes tasks across a pool of TaskExecutors,
    arbitrating access to shared resources via the dispatcher's resource graph.
    """

    def __init__(self, dispatcher):
        self.dispatcher = dispatcher
        self.task_list = []
        self.task_queue = Queue()
        self.resource_graph = dispatcher.resource_graph
        self.threads = []
        self.executors = []
        self.logger = logging.getLogger("Balancer")
        # Tasks are journalled into a serial log-type collection.
        self.dispatcher.require_collection("tasks", "serial", type="log")
        self.create_initial_queues()
        self.start_executors()
        self.schedule_lock = RLock()
        self.distribution_lock = RLock()
        # Debugger hooks - disabled unless explicitly configured.
        self.debugger = None
        self.debugged_tasks = None
        self.dispatcher.register_event_type("task.changed")

        # Lets try to get `EXECUTING|WAITING|CREATED` state tasks
        # from the previous dispatcher instance and set their
        # states to 'FAILED' since they are no longer running
        # in this instance of the dispatcher
        for stale_task in dispatcher.datastore.query("tasks", ("state", "in", ["EXECUTING", "WAITING", "CREATED"])):
            self.logger.info(
                "Stale task ID: {0}, name: {1} being set to FAILED".format(stale_task["id"], stale_task["name"])
            )
            stale_task.update(
                {
                    "state": "FAILED",
                    "error": {
                        "type": "TaskException",
                        "message": "dispatcher process died",
                        "code": errno.EINTR,
                        "stacktrace": "",
                        "extra": None,
                    },
                }
            )
            dispatcher.datastore.update("tasks", stale_task["id"], stale_task)

    def create_initial_queues(self):
        # The root "system" resource every task implicitly hangs off.
        self.resource_graph.add_resource(Resource("system"))

    def start_executors(self):
        for i in range(0, self.dispatcher.configstore.get("middleware.executors_count")):
            self.logger.info("Starting task executor #{0}...".format(i))
            self.executors.append(TaskExecutor(self, i))

    def start(self):
        self.threads.append(gevent.spawn(self.distribution_thread))
        self.logger.info("Started")

    def schema_to_list(self, schema):
        # Wraps a per-argument schema list into a single JSON-schema array.
        return {
            "type": "array",
            "items": schema,
            "minItems": sum([1 for x in schema if "mandatory" in x and x["mandatory"]]),
            "maxItems": len(schema),
        }

    def verify_schema(self, clazz, args, strict=False):
        # Returns a list of validation errors; empty when the class declares
        # no schema at all.
        if not hasattr(clazz, "params_schema"):
            return []
        schema = self.schema_to_list(clazz.params_schema)
        val = validator.create_validator(schema, resolver=self.dispatcher.rpc.get_schema_resolver(schema))
        if strict:
            val.fail_read_only = True
        else:
            val.remove_read_only = True
        return list(val.iter_errors(args))

    def submit(self, name, args, sender, env=None):
        """Create a task of the given type on behalf of sender and queue it.

        Returns the new task ID; raises RpcException for unknown task types.
        """
        if name not in self.dispatcher.tasks:
            self.logger.warning("Cannot submit task: unknown task type %s", name)
            raise RpcException(errno.EINVAL, "Unknown task type {0}".format(name))

        task = Task(self.dispatcher, name)
        task.user = sender.user.name
        task.session_id = sender.session_id
        task.created_at = datetime.utcnow()
        task.clazz = self.dispatcher.tasks[name]
        task.hooks = self.dispatcher.task_hooks.get(name, {})
        # Deep-copied so later mutation by the caller cannot affect the task.
        task.args = copy.deepcopy(args)
        task.strict_verify = "strict_validation" in sender.enabled_features

        if env:
            if not isinstance(env, dict):
                raise ValueError("env must be a dict")
            task.environment = copy.deepcopy(env)

        if self.debugger:
            for m in self.debugged_tasks:
                if fnmatch.fnmatch(name, m):
                    task.debugger = self.debugger

        # The environment may override the submitting user.
        if "RUN_AS_USER" in task.environment:
            task.user = task.environment["RUN_AS_USER"]

        task.environment["SENDER_ADDRESS"] = sender.client_address
        task.id = self.dispatcher.datastore.insert("tasks", task)
        task.set_state(TaskState.CREATED)
        self.task_queue.put(task)
        self.logger.info("Task %d submitted (type: %s, class: %s)", task.id, name, task.clazz)
        return task.id

    def submit_with_upload(self, task_name, args, sender, env=None):
        # Replaces every "fd"-typed schema argument with the read end of a
        # pipe and issues an upload token for the write end.
        task_metadata = self.dispatcher.tasks[task_name]._get_metadata()
        schema = task_metadata["schema"]
        if schema is None:
            raise RpcException(errno.ENOENT, "Task {0} has no schema associated with it".format(task_name))
        upload_token_list = []
        for idx, arg in enumerate(schema):
            if arg.get("type") == "fd":
                rfd, wfd = os.pipe()
                token = self.dispatcher.token_store.issue_token(
                    FileToken(
                        user=sender.user,
                        lifetime=60,
                        direction="upload",
                        file=FileObjectPosix(wfd, "wb", close=True),
                        name=str(uuid.uuid4()),
                        size=None,
                    )
                )
                upload_token_list.append(token)
                args[idx] = FileDescriptor(rfd)
        task_id = self.submit(task_name, args, sender, env)
        return task_id, upload_token_list

    def submit_with_download(self, task_name, args, sender, env=None):
        # Mirror image of submit_with_upload: tasks write into the pipe and
        # clients download from the returned URLs.
        task_metadata = self.dispatcher.tasks[task_name]._get_metadata()
        schema = task_metadata["schema"]
        url_list = []
        if schema is None:
            raise RpcException(errno.ENOENT, "Task {0} has no schema associated with it".format(task_name))
        for idx, arg in enumerate(schema):
            if arg.get("type") == "fd":
                rfd, wfd = os.pipe()
                url_list.append(
                    "/dispatcher/filedownload?token={0}".format(
                        self.dispatcher.token_store.issue_token(
                            FileToken(
                                user=sender.user,
                                lifetime=60,
                                direction="download",
                                file=FileObjectPosix(rfd, "rb", close=True),
                                name=args[idx],
                            )
                        )
                    )
                )
                args[idx] = FileDescriptor(wfd)
        task_id = self.submit(task_name, args, sender, env)
        return task_id, url_list

    def verify_subtask(self, parent, name, args):
        clazz = self.dispatcher.tasks[name]
        instance = clazz(self.dispatcher, self.dispatcher.datastore)
        return instance.verify(*args)

    def run_subtask(self, parent, name, args, env=None):
        """Create and immediately start a subtask, inheriting parent's environment."""
        args = list(args)
        task = Task(self.dispatcher, name)
        task.created_at = datetime.utcnow()
        task.clazz = self.dispatcher.tasks[name]
        task.hooks = self.dispatcher.task_hooks.get(name, {})
        task.args = args
        task.instance = task.clazz(self.dispatcher, self.dispatcher.datastore)
        task.instance.verify(*task.args)
        task.description = task.instance.describe(*task.args)
        task.id = self.dispatcher.datastore.insert("tasks", task)
        task.parent = parent
        task.environment = {}

        if parent:
            task.environment = copy.deepcopy(parent.environment)
            task.environment["parent"] = parent.id
            task.user = parent.user

        if env:
            if not isinstance(env, dict):
                raise ValueError("env must be a dict")
            task.environment.update(env)

        if self.debugger:
            for m in self.debugged_tasks:
                if fnmatch.fnmatch(name, m):
                    task.debugger = self.debugger

        task.set_state(TaskState.CREATED)
        self.task_list.append(task)
        task.start()
        return task

    def join_subtasks(self, *tasks):
        for i in tasks:
            i.join()

    def abort(self, id, error=None):
        """Abort a task; tasks that already started are aborted via their executor.

        NOTE(review): `success` stays False when executor.abort() succeeds, so
        the state-setting branch below only runs for not-yet-started tasks -
        confirm this asymmetry is intended.
        """
        task = self.get_task(id)
        if not task:
            self.logger.warning("Cannot abort task: unknown task id %d", id)
            return

        success = False
        if task.started_at is None:
            success = True
        else:
            try:
                task.executor.abort()
            except:
                pass
        if success:
            task.ended.set()
            if error:
                task.set_state(TaskState.FAILED, TaskStatus(0), serialize_error(error))
                self.logger.debug("Task ID: %d, name: %s aborted with error", task.id, task.name)
            else:
                task.set_state(TaskState.ABORTED, TaskStatus(0, "Aborted"))
                self.logger.debug("Task ID: %d, name: %s aborted by user", task.id, task.name)

    def task_exited(self, task):
        # Free the task's resources, then let waiting tasks run.
        self.resource_graph.release(*task.resources)
        self.schedule_tasks(True)

    def schedule_tasks(self, exit=False):
        """
        This function is called when:
        1) any new task is submitted to any of the queues
        2) any task exists
        """
        with self.schedule_lock:
            started = 0
            executing_tasks = [t for t in self.task_list if t.state == TaskState.EXECUTING]
            waiting_tasks = [t for t in self.task_list if t.state == TaskState.WAITING]
            # Start every waiting task whose resources are all available.
            for task in waiting_tasks:
                if not self.resource_graph.can_acquire(*task.resources):
                    continue
                self.resource_graph.acquire(*task.resources)
                self.threads.append(task.start())
                started += 1
            if not started and not executing_tasks and (exit or len(waiting_tasks) == 1):
                for task in waiting_tasks:
                    # Check whether or not task waits on nonexistent resources. If it does,
                    # abort it 'cause there's no chance anymore that missing resources will appear.
                    if any(self.resource_graph.get_resource(res) is None for res in task.resources):
                        self.logger.warning("Aborting task {0}: deadlock".format(task.id))
                        self.abort(task.id, VerifyException(errno.EBUSY, "Resource deadlock avoided"))

    def distribution_thread(self):
        # Main loop: pull submitted tasks, verify them and hand them to the
        # scheduler. The distribution lock is held between picking a task up
        # and appending it to task_list (or failing it).
        while True:
            self.task_queue.peek()
            self.distribution_lock.acquire()
            task = self.task_queue.get()
            try:
                self.logger.debug("Picked up task %d: %s with args %s", task.id, task.name, task.args)
                errors = self.verify_schema(self.dispatcher.tasks[task.name], task.args, task.strict_verify)
                if len(errors) > 0:
                    errors = list(validator.serialize_errors(errors))
                    self.logger.warning(
                        "Cannot submit task {0}: schema verification failed with errors {1}".format(task.name, errors)
                    )
                    raise ValidationException(extra=errors)
                task.instance = task.clazz(self.dispatcher, self.dispatcher.datastore)
                task.resources = task.instance.verify(*task.args)
                task.description = task.instance.describe(*task.args)
                if type(task.resources) is not list:
                    raise ValueError("verify() returned something else than resource list")
            except Exception as err:
                # Verification failures mark the task FAILED; unexpected
                # exception types are additionally reported upstream.
                self.logger.warning("Cannot verify task %d: %s", task.id, err)
                task.set_state(TaskState.FAILED, TaskStatus(0), serialize_error(err))
                task.ended.set()
                self.distribution_lock.release()
                if not isinstance(err, VerifyException):
                    self.dispatcher.report_error("Task {0} verify() method raised invalid exception".format(err), err)
                continue
            task.set_state(TaskState.WAITING)
            self.task_list.append(task)
            self.distribution_lock.release()
            self.schedule_tasks()
            if task.resources:
                self.logger.debug("Task %d assigned to resources %s", task.id, ",".join(task.resources))

    def assign_executor(self, task):
        # Prefer an idle executor; otherwise grow the pool by one and wait
        # for the new worker to report itself idle.
        for i in self.executors:
            with i.cv:
                if i.state == WorkerState.IDLE:
                    self.logger.info("Task %d assigned to executor #%d", task.id, i.index)
                    task.executor = i
                    i.state = WorkerState.ASSIGNED
                    return
        # Out of executors! Need to spawn new one
        executor = TaskExecutor(self, len(self.executors))
        self.executors.append(executor)
        with executor.cv:
            executor.cv.wait_for(lambda: executor.state == WorkerState.IDLE)
            executor.state = WorkerState.ASSIGNED
            task.executor = executor
        self.logger.info("Task %d assigned to executor #%d", task.id, executor.index)

    def dispose_executors(self):
        for i in self.executors:
            i.die()

    def get_active_tasks(self):
        return [x for x in self.task_list if x.state in (TaskState.CREATED, TaskState.WAITING, TaskState.EXECUTING)]

    def get_tasks(self, type=None):
        if type is None:
            return self.task_list
        return [x for x in self.task_list if x.state == type]

    def get_task(self, id):
        # Checks accepted tasks first, then the not-yet-distributed queue.
        self.distribution_lock.acquire()
        t = first_or_default(lambda x: x.id == id, self.task_list)
        if not t:
            t = first_or_default(lambda x: x.id == id, self.task_queue.queue)
        self.distribution_lock.release()
        return t

    def get_executor_by_key(self, key):
        return first_or_default(lambda t: t.key == key, self.executors)

    def get_executor_by_sender(self, sender):
        return first_or_default(lambda t: t.conn == sender, self.executors)
def __init__(self, states, events, enter_event, exit_event): self._lock = RLock() super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)
class ResourceGraph(object):
    """Directed graph of named resources with busy-flag based locking.

    Edges run parent -> child; acquiring a resource requires that none of
    its descendants are busy. All mutators hold ``self.mutex``.
    """

    def __init__(self):
        self.logger = logging.getLogger('ResourceGraph')
        self.mutex = RLock()
        self.root = Resource('root')
        self.resources = nx.DiGraph()
        self.resources.add_node(self.root)

    def lock(self):
        self.mutex.acquire()

    def unlock(self):
        self.mutex.release()

    @property
    def nodes(self):
        return self.resources.nodes()

    def add_resource(self, resource, parents=None):
        """Insert *resource* under *parents* (default: the root node).

        :raises ResourceError: invalid/duplicate resource or unknown parent.
        """
        self.lock()
        # try/finally guarantees the mutex is released on every raise path
        # (previously some raises left the graph locked forever).
        try:
            if not resource:
                raise ResourceError('Invalid resource')

            if self.get_resource(resource.name):
                raise ResourceError('Resource {0} already exists'.format(resource.name))

            self.resources.add_node(resource)
            if not parents:
                parents = ['root']

            for p in parents:
                node = self.get_resource(p)
                if not node:
                    raise ResourceError('Invalid parent resource {0}'.format(p))
                self.resources.add_edge(node, resource)
        finally:
            self.unlock()

    def remove_resource(self, name):
        """Remove the named resource and its entire descendant subtree."""
        self.lock()
        try:
            resource = self.get_resource(name)
            if not resource:
                return

            for i in nx.descendants(self.resources, resource):
                self.resources.remove_node(i)

            self.resources.remove_node(resource)
        finally:
            self.unlock()

    def update_resource(self, name, new_parents):
        """Re-parent the named resource under *new_parents*.

        :raises ResourceError: when any new parent does not exist.
        """
        self.lock()
        try:
            resource = self.get_resource(name)
            if not resource:
                return

            # BUG FIX: re-parenting must drop the edges from the OLD parents,
            # not delete the descendant subtree (which also left stale parent
            # edges in place). Matches the corrected behavior used elsewhere.
            for i in list(self.resources.predecessors(resource)):
                self.resources.remove_edge(i, resource)

            for p in new_parents:
                node = self.get_resource(p)
                if not node:
                    raise ResourceError('Invalid parent resource {0}'.format(p))
                self.resources.add_edge(node, resource)
        finally:
            self.unlock()

    def get_resource(self, name):
        """Return the node named *name*, or None."""
        f = [i for i in self.resources.nodes() if i.name == name]
        return f[0] if len(f) > 0 else None

    def get_resource_dependencies(self, name):
        """Yield the names of the direct parents of *name*."""
        res = self.get_resource(name)
        for i, _ in self.resources.in_edges([res]):
            yield i.name

    def acquire(self, *names):
        """Mark the named resources busy.

        :raises ResourceError: unknown resource, or a dependent resource busy.
        """
        self.lock()
        # BUG FIX: the "resource not found" raise previously escaped without
        # releasing the mutex, deadlocking every later graph operation.
        try:
            self.logger.debug('Acquiring following resources: %s', ','.join(names))
            for name in names:
                res = self.get_resource(name)
                if not res:
                    raise ResourceError('Resource {0} not found'.format(name))

                for i in nx.descendants(self.resources, res):
                    if i.busy:
                        raise ResourceError('Cannot acquire, some of dependent resources are busy')

                res.busy = True
        finally:
            self.unlock()

    def can_acquire(self, *names):
        """Return True when every named resource (and its descendants) is free."""
        self.lock()
        try:
            self.logger.debug('Trying to acquire following resources: %s', ','.join(names))
            for name in names:
                res = self.get_resource(name)
                if not res:
                    return False

                if res.busy:
                    return False

                for i in nx.descendants(self.resources, res):
                    if i.busy:
                        return False

            return True
        finally:
            self.unlock()

    def release(self, *names):
        """Clear the busy flag on the named resources."""
        self.lock()
        try:
            self.logger.debug('Releasing following resources: %s', ','.join(names))
            for name in names:
                res = self.get_resource(name)
                res.busy = False
        finally:
            self.unlock()
class Client(object):
    """RPC/event client speaking the dispatcher JSON protocol over a
    pluggable transport (selected from the URL scheme).

    Outgoing calls are tracked in ``pending_calls`` keyed by str(uuid);
    ``decode`` routes incoming frames to RPC responses, events, or the
    optional server-side RPC context.
    """

    class PendingCall(object):
        # Bookkeeping for one in-flight RPC call.
        def __init__(self, id, method, args=None):
            self.id = id
            self.method = method
            self.args = args
            self.result = None
            self.error = None
            self.completed = Event()
            self.callback = None

    class SubscribedEvent(object):
        # One event-mask subscription with optional filter predicates.
        def __init__(self, name, *filters):
            self.name = name
            self.refcount = 0
            self.filters = filters

        def match(self, name, args):
            """Return True when *name* matches and filters accept *args*."""
            if self.name != name:
                return False

            if self.filters:
                return match(args, *self.filters)

            # BUG FIX: a filterless subscription matches any payload of the
            # right name; previously this fell through and returned None.
            return True

    def __init__(self):
        self.pending_calls = {}
        self.pending_events = []
        self.event_handlers = {}
        self.rpc = None
        self.event_callback = None
        self.error_callback = None
        self.rpc_callback = None
        self.receive_thread = None
        self.token = None
        self.event_distribution_lock = RLock()
        self.event_emission_lock = RLock()
        self.default_timeout = 20
        self.scheme = None
        self.transport = None
        self.parsed_url = None
        self.last_event_burst = None
        self.use_bursts = False
        self.event_cv = Event()
        self.event_thread = None

    def __pack(self, namespace, name, args, id=None):
        """Serialize one protocol frame; a random uuid is generated when no
        id is supplied."""
        return dumps({
            'namespace': namespace,
            'name': name,
            'args': args,
            'id': str(id if id is not None else uuid.uuid4())
        })

    def __call_timeout(self, call):
        # Placeholder; timeouts are handled synchronously in call_sync.
        pass

    def __call(self, pending_call, call_type='call', custom_payload=None):
        # Transmit a tracked call; custom_payload overrides the standard
        # {method, args} body (used by the auth variants).
        if custom_payload is None:
            payload = {
                'method': pending_call.method,
                'args': pending_call.args,
            }
        else:
            payload = custom_payload

        self.__send(self.__pack(
            'rpc',
            call_type,
            payload,
            pending_call.id
        ))

    def __send_event(self, name, params):
        self.__send(self.__pack(
            'events',
            'event',
            {'name': name, 'args': params}
        ))

    def __send_event_burst(self):
        # Flush all queued events as one frame, under the emission lock.
        with self.event_emission_lock:
            self.__send(self.__pack(
                'events',
                'event_burst',
                {'events': list([{'name': t[0], 'args': t[1]} for t in self.pending_events])},
            ))
            del self.pending_events[:]

    def __send_error(self, id, errno, msg, extra=None):
        payload = {
            'code': errno,
            'message': msg
        }
        if extra is not None:
            payload.update(extra)
        self.__send(self.__pack('rpc', 'error', id=id, args=payload))

    def __send_response(self, id, resp):
        self.__send(self.__pack('rpc', 'response', id=id, args=resp))

    def __send(self, data):
        debug_log('<- {0}', data)
        self.transport.send(data)

    def recv(self, message):
        """Transport callback: decode one raw frame and dispatch it."""
        if isinstance(message, bytes):
            message = message.decode('utf-8')

        debug_log('-> {0}', message)
        try:
            msg = loads(message)
        except ValueError as err:
            if self.error_callback is not None:
                self.error_callback(ClientError.INVALID_JSON_RESPONSE, err)
            return

        self.decode(msg)

    def __process_event(self, name, args):
        # Serialize handler execution so register/unregister stay consistent.
        self.event_distribution_lock.acquire()
        if name in self.event_handlers:
            for h in self.event_handlers[name]:
                h(args)

        if self.event_callback:
            self.event_callback(name, args)
        self.event_distribution_lock.release()

    def __event_emitter(self):
        # NOTE(review): this waits while events are still pending before
        # flushing, which looks inverted (the burst is what drains the
        # queue) -- behavior kept as-is pending confirmation.
        while True:
            self.event_cv.wait()

            while len(self.pending_events) > 0:
                time.sleep(0.1)
                with self.event_emission_lock:
                    self.__send_event_burst()

    def wait_forever(self):
        """Block the calling thread indefinitely (gevent-aware)."""
        if os.getenv("DISPATCHERCLIENT_TYPE") == "GEVENT":
            import gevent
            while True:
                gevent.sleep(60)
        else:
            while True:
                time.sleep(60)

    def drop_pending_calls(self):
        """Fail every in-flight call with ECONNABORTED and wake its waiter."""
        message = "Connection closed"
        # BUG FIX: iterate over a snapshot -- deleting from the dict while
        # iterating a live view raises RuntimeError on Python 3.
        for key, call in list(self.pending_calls.items()):
            call.result = None
            call.error = {
                "code": errno.ECONNABORTED,
                "message": message
            }
            call.completed.set()
            del self.pending_calls[key]

    def decode(self, msg):
        """Dispatch one decoded protocol frame (events / rpc namespaces)."""
        if 'namespace' not in msg:
            self.error_callback(ClientError.INVALID_JSON_RESPONSE)
            return

        if 'name' not in msg:
            self.error_callback(ClientError.INVALID_JSON_RESPONSE)
            return

        if msg['namespace'] == 'events' and msg['name'] == 'event':
            args = msg['args']
            t = spawn_thread(target=self.__process_event, args=(args['name'], args['args']))
            t.start()
            return

        if msg['namespace'] == 'events' and msg['name'] == 'event_burst':
            args = msg['args']
            for i in args['events']:
                t = spawn_thread(target=self.__process_event, args=(i['name'], i['args']))
                t.start()
            return

        if msg['namespace'] == 'events' and msg['name'] == 'logout':
            self.error_callback(ClientError.LOGOUT)
            return

        if msg['namespace'] == 'rpc':
            if msg['name'] == 'call':
                if self.rpc is None:
                    self.__send_error(msg['id'], errno.EINVAL, 'Server functionality is not supported')
                    return

                if 'args' not in msg:
                    self.__send_error(msg['id'], errno.EINVAL, 'Malformed request')
                    return

                args = msg['args']
                if 'method' not in args or 'args' not in args:
                    self.__send_error(msg['id'], errno.EINVAL, 'Malformed request')
                    return

                def run_async(msg, args):
                    # Server-side call: run off-thread and reply when done.
                    try:
                        result = self.rpc.dispatch_call(args['method'], args['args'], sender=self)
                    except rpc.RpcException as err:
                        self.__send_error(msg['id'], err.code, err.message)
                    else:
                        self.__send_response(msg['id'], result)

                t = spawn_thread(target=run_async, args=(msg, args))
                t.start()
                return

            if msg['name'] == 'response':
                if msg['id'] in self.pending_calls:
                    call = self.pending_calls[msg['id']]
                    call.result = msg['args']
                    call.completed.set()
                    if call.callback is not None:
                        call.callback(msg['args'])

                    del self.pending_calls[str(call.id)]
                else:
                    if self.error_callback is not None:
                        self.error_callback(ClientError.SPURIOUS_RPC_RESPONSE, msg['id'])

            if msg['name'] == 'error':
                if msg['id'] in self.pending_calls:
                    call = self.pending_calls[msg['id']]
                    call.result = None
                    call.error = msg['args']
                    call.completed.set()
                    del self.pending_calls[str(call.id)]
                if self.error_callback is not None:
                    self.error_callback(ClientError.RPC_CALL_ERROR)

    def parse_url(self, url):
        self.parsed_url = urlsplit(url, scheme="http")
        self.scheme = self.parsed_url.scheme

    def connect(self, url, **kwargs):
        """Resolve the transport from the URL scheme and open the connection."""
        self.parse_url(url)
        if not self.scheme:
            self.scheme = kwargs.get('scheme', "ws")
        else:
            if 'scheme' in kwargs:
                raise ValueError('Connection scheme cannot be declared in both url and arguments.')
        # BUG FIX: string comparison must use ==, not identity ("is" against
        # a literal is implementation-dependent and a SyntaxWarning today).
        if self.scheme == "http":
            self.scheme = "ws"

        builder = ClientTransportBuilder()
        self.transport = builder.create(self.scheme)
        self.transport.connect(self.parsed_url, self, **kwargs)
        debug_log('Connection opened, local address {0}', self.transport.address)

        if self.use_bursts:
            self.event_thread = spawn_thread(target=self.__event_emitter, args=())
            self.event_thread.start()

    def login_user(self, username, password, timeout=None):
        """Authenticate with username/password; stores the session token."""
        call = self.PendingCall(uuid.uuid4(), 'auth')
        self.pending_calls[str(call.id)] = call
        self.__call(call, call_type='auth', custom_payload={'username': username, 'password': password})

        call.completed.wait(timeout)
        if call.error:
            raise rpc.RpcException(
                call.error['code'], call.error['message'],
                call.error['extra'] if 'extra' in call.error else None)

        self.token = call.result[0]

    def login_service(self, name, timeout=None):
        """Authenticate as a named internal service."""
        call = self.PendingCall(uuid.uuid4(), 'auth')
        self.pending_calls[str(call.id)] = call
        self.__call(call, call_type='auth_service', custom_payload={'name': name})
        # BUG FIX: wait for the reply BEFORE inspecting call.error --
        # the check used to run immediately and always saw no error.
        call.completed.wait(timeout)
        if call.error:
            raise rpc.RpcException(
                call.error['code'], call.error['message'],
                call.error['extra'] if 'extra' in call.error else None)

    def login_token(self, token, timeout=None):
        """Authenticate with a previously-issued token."""
        call = self.PendingCall(uuid.uuid4(), 'auth')
        self.pending_calls[str(call.id)] = call
        self.__call(call, call_type='auth_token', custom_payload={'token': token})

        call.completed.wait(timeout)
        if call.error:
            raise rpc.RpcException(
                call.error['code'], call.error['message'],
                call.error['extra'] if 'extra' in call.error else None)

        self.token = call.result[0]

    def disconnect(self):
        debug_log('Closing connection, local address {0}', self.transport.address)
        self.transport.close()

    def enable_server(self):
        # Allow the remote side to call RPC methods on us.
        self.rpc = rpc.RpcContext()

    def on_event(self, callback):
        self.event_callback = callback

    def on_call(self, callback):
        self.rpc_callback = callback

    def on_error(self, callback):
        self.error_callback = callback

    def subscribe_events(self, *masks):
        self.__send(self.__pack('events', 'subscribe', masks))

    def unsubscribe_events(self, *masks):
        self.__send(self.__pack('events', 'unsubscribe', masks))

    def register_service(self, name, impl):
        if self.rpc is None:
            raise RuntimeError('Call enable_server() first')

        self.rpc.register_service_instance(name, impl)
        self.call_sync('plugin.register_service', name)

    def unregister_service(self, name):
        if self.rpc is None:
            raise RuntimeError('Call enable_server() first')

        self.rpc.unregister_service(name)
        self.call_sync('plugin.unregister_service', name)

    def resume_service(self, name):
        if self.rpc is None:
            raise RuntimeError('Call enable_server() first')

        self.call_sync('plugin.resume_service', name)

    def register_schema(self, name, schema):
        if self.rpc is None:
            raise RuntimeError('Call enable_server() first')

        self.call_sync('plugin.register_schema', name, schema)

    def unregister_schema(self, name):
        if self.rpc is None:
            raise RuntimeError('Call enable_server() first')

        self.call_sync('plugin.unregister_schema', name)

    def call_async(self, name, callback, *args):
        """Fire an RPC call without blocking; *callback* receives the result."""
        call = self.PendingCall(uuid.uuid4(), name, args)
        # BUG FIX: the call was recorded but never transmitted, the callback
        # was ignored, and the dict key did not match the str(id) key the
        # response handler looks up.
        call.callback = callback
        self.pending_calls[str(call.id)] = call
        self.__call(call)
        return call

    def call_sync(self, name, *args, **kwargs):
        """Blocking RPC call; raises RpcException on error or timeout."""
        timeout = kwargs.pop('timeout', self.default_timeout)
        call = self.PendingCall(uuid.uuid4(), name, args)
        self.pending_calls[str(call.id)] = call
        self.__call(call)

        if not call.completed.wait(timeout):
            if self.error_callback:
                self.error_callback(ClientError.RPC_CALL_TIMEOUT, method=call.method, args=call.args)
            raise rpc.RpcException(errno.ETIMEDOUT, 'Call timed out')

        if call.result is None and call.error is not None:
            raise rpc.RpcException(
                call.error['code'], call.error['message'],
                call.error['extra'] if 'extra' in call.error else None)

        return call.result

    def call_task_sync(self, name, *args):
        """Submit a task, wait for it (up to 1h), and return its status."""
        tid = self.call_sync('task.submit', name, args)
        self.call_sync('task.wait', tid, timeout=3600)
        return self.call_sync('task.status', tid)

    def submit_task(self, name, *args):
        return self.call_sync('task.submit', name, args)

    def emit_event(self, name, params):
        if not self.use_bursts:
            self.__send_event(name, params)
        else:
            self.pending_events.append((name, params))
            self.event_cv.set()
            self.event_cv.clear()

    def register_event_handler(self, name, handler):
        """Attach *handler* to *name* and subscribe to it server-side."""
        self.event_handlers.setdefault(name, []).append(handler)
        self.subscribe_events(name)
        return handler

    def unregister_event_handler(self, name, handler):
        self.event_handlers[name].remove(handler)

    def exec_and_wait_for_event(self, event, match_fn, fn, timeout=None):
        """Run *fn* and wait until an *event* satisfying *match_fn* arrives.

        The distribution lock is held across fn() and handler registration so
        an event fired by fn() cannot be missed.
        """
        done = Event()
        self.subscribe_events(event)
        self.event_distribution_lock.acquire()
        try:
            fn()
        except:
            self.event_distribution_lock.release()
            raise

        def handler(args):
            if match_fn(args):
                done.set()

        self.register_event_handler(event, handler)
        self.event_distribution_lock.release()
        done.wait(timeout=timeout)
        self.unregister_event_handler(event, handler)

    def test_or_wait_for_event(self, event, match_fn, initial_condition_fn, timeout=None):
        """Return immediately when *initial_condition_fn* already holds,
        otherwise wait for a matching *event* (or timeout)."""
        done = Event()
        self.subscribe_events(event)
        self.event_distribution_lock.acquire()
        if initial_condition_fn():
            self.event_distribution_lock.release()
            return

        def handler(args):
            if match_fn(args):
                done.set()

        self.register_event_handler(event, handler)
        self.event_distribution_lock.release()
        done.wait(timeout=timeout)
        self.unregister_event_handler(event, handler)

    def get_lock(self, name):
        """Return a proxy for a server-side named lock."""
        self.call_sync('lock.init', name)
        return rpc.ServerLockProxy(self, name)
class Crawler(object):
    """Focused (site-targeted) crawler base class.

    Subclasses override :meth:`run` to kick off the crawl; the gevent pool
    and NetworkManager drive the actual fetching.
    """
    http_debuglevel = 0
    #: Predefined page encoding (None = autodetect).
    encoding = None
    #: User-Agent header; masquerading as a search-engine bot sometimes
    #: changes how sites respond.
    user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
    # Accept-Language header; some sites key their i18n off this.
    accept_language = 'zh_CN'
    # Acceptable MIME types (Accept header).
    accept_mine = 'text/html,application/xhtml+xml,' \
        'application/xml;q=0.9,*/*;q=0.8'
    #: Maximum redirects, to avoid redirect loops/traps.
    max_redirects = 20
    #: Maximum concurrent connections per crawler.
    max_connections = 10
    #: Request timeout (seconds).
    timeout = 360
    #: Maximum number of failed attempts.
    max_retries = 1000
    #: Sleep interval increment per failed attempt.
    #: E.g. with ``sleep_seconds = 2``: first failure sleeps 2s, second 4s,
    #: third 6s, and so on.
    sleep_seconds = 1
    #: Bloom filter capacity.
    bloom_capacity = 10000000
    #: Bloom filter expected false-positive rate.
    bloom_error_rate = 0.0001
    #: HTTP proxies.
    proxies = None
    #: Directory where error dumps are written.
    dump_dir = 'dump/'
    is_stop = True
    stopped = False
    name = None
    retry_with_broken_content = False
    retry_with_no_content = False
    #: Server error codes that should be treated as normal pages.
    ignore_server_error_code = ()
    #: Server error codes that are ignored without any retry.
    do_not_retry_with_server_error_code = ()
    lock = None
    logger = logging.getLogger('Crawler')

    def __init__(self):
        httplib.HTTPConnection.debuglevel = self.http_debuglevel
        self.network = NetworkManager(crawler=self)
        self.pool = Pool()
        self.lock = RLock()
        self.bloom_filters = {}
        self.name = self.__class__.__name__
        self._status = {
            'process_count': 0,
            'is_stop': True,
            'run_seconds': 0,
            'crawler_name': self.name,
        }

    # def sync_bloom(self):
    #     """Force-sync bloom filters to disk."""
    #
    #     while not self.is_stop:
    #         for key in self.bloom_filters.keys():
    #             self.bloom_filters[key].sync()
    #         gevent.sleep(1)

    def work(self):
        """Start the crawler.

        The non-blocking lock acquire makes work() a no-op when the crawler
        is already running.
        """
        if self.lock.acquire(blocking=False):
            self.logger.info('Starting crawler %s' % self.name)
            self.stopped = False
            self._status['is_stop'] = False
            self.pool.spawn(self.run)
            self.pool.join()
            self.network.join()
            self._status['is_stop'] = True
            self.logger.info('Finished crawler %s' % self.name)
            self.lock.release()

    def on_server_error(self, response):
        """Server error callback.

        :param response:
        :raise ServerError:
        """
        self.logger.warning('Something wrong with server.')
        raise ServerError('Error Code:%s' % response.status_code)

    def on_proxies_error(self, proxy):
        pass

    def on_parse_error(self, error):
        """Page-parsing error callback.

        :param error:
        """

    def fetch_proxies(self):
        pass

    def stop(self):
        """Stop the crawler and drain any queued requests."""
        self.logger.info('Stopping crawler %s' % self.name)
        self.stopped = True
        while not self.network._request_queue.empty():
            self.network._request_queue.get()

    def status(self):
        """Return the crawler status dict.

        :return:
        :rtype:
        """
        return self._status

    def run(self):
        """Entry point for the crawl — the first domino.

        Subclasses MUST override this.
        """
        raise NotImplementedError
class DataMgr(Greenlet):
    """In-memory message/user store with periodic mongo polling.

    Runs as a greenlet; :meth:`run` polls mongo for new message ids and
    snapshots state via :meth:`_save_cache`.
    """
    # Attributes persisted by _save_cache/bootstrap.
    pickle_names = ['_msgs', '_users', 'send_queue', 'pending_online_users']
    data_version = 1000

    def __init__(self, logger):
        Greenlet.__init__(self)
        self.logger = logger
        self._users_lock = RLock()
        self._msgs = {}
        self._users = {}
        self.send_queue = Queue()
        self.pending_online_users = Queue()
        self.bootstrap()
        self._dying = False
        self.start()

    def bootstrap(self):
        """Restore data from disk.

        :raises Exception: when the on-disk pickle version does not match.
        """
        _ = opath.join(DATA_DIR, DM_PKL_NAME)
        if opath.exists(_):
            _ = pickle.load(file(_, 'rb'))
            if '_version' not in _ or _['_version'] != DataMgr.data_version:
                # BUG FIX: the file-version placeholder must be %s -- it can
                # be None, and "%d" % None raises TypeError, masking this
                # mismatch error with a formatting error.
                raise Exception(" pkl file mismatch:program(%d) file(%s)" %
                                (DataMgr.data_version,
                                 None if '_version' not in _ else _['_version']))
            self.__dict__.update(_)

    def shutdown(self):
        """Save data to disk and flag the run loop to exit."""
        self._dying = True
        self.logger.debug('[DM] saving data to disk...')
        self._save_cache()

    def reset(self):
        """Reset in-memory queues and remove the on-disk snapshot."""
        self.send_queue = Queue()
        self.pending_online_users = Queue()
        _ = opath.join(DATA_DIR, DM_PKL_NAME)
        if opath.exists(_):
            os.remove(_)

    def _save_cache(self):
        # fixme: save to external database not implemented
        _ = {'_version': DataMgr.data_version}
        for k in DataMgr.pickle_names:
            if k in self.__dict__:
                _[k] = self.__dict__[k]
        #pickle.dump(_, file(opath.join(DATA_DIR, DM_PKL_NAME), 'wb'), pickle.HIGHEST_PROTOCOL)

    def msg_add(self, msg):
        """Add message to the message map.

        :param msg: msg to add
        :type msg: MessageObj
        :raises ValueError: when *msg* is not a MessageObj
        """
        if not isinstance(msg, MessageObj):
            raise ValueError(" argument is not a MessageObj")
        self._msgs[msg.msgid] = msg

    def msg_get(self, msgid):
        """Get message by msgid.

        :param msgid: message id
        :type msgid: int
        :raises IndexError: when the id is unknown
        """
        if msgid not in self._msgs:
            # BUG FIX: the message referenced an undefined name `idx`,
            # turning every miss into a NameError.
            raise IndexError(" msgid %s not in queue" % msgid)
        return self._msgs[msgid]

    def msg_del(self, msgid):
        """Delete message by msgid.

        :param msgid: message id
        :type msgid: int
        """
        del self._msgs[msgid]

    def msg_set(self, msgid, msg):
        self._msgs[msgid] = msg

    @property
    def msg_count(self):
        """Number of stored messages."""
        return len(self._msgs)

    def set_user_online(self, guid):
        """Mark user online; creates a UserObj and queues the guid.

        :param guid: user guid
        :type guid: int
        """
        #TODO get userid from rid
        uid = "u" + guid
        u = UserObj(uid, guid)
        self.users_add(u)
        self.pending_online_users.put(guid)

    def set_user_offline(self, guid):
        """Mark user offline.

        :param guid: user guid
        """
        #TODO get userid from rid
        self.users_del(guid)

    def users_add(self, u):
        """Add a user instance to the user map (guarded by _users_lock).

        :param u: user instance
        :type u: UserObj
        :raises ValueError: when *u* is not a UserObj
        """
        if not isinstance(u, UserObj):
            raise ValueError(" argument is not a UserObj")
        self._users_lock.acquire()
        self._users[u.guid] = u
        self._users_lock.release()

    def users_get(self, guid):
        """Get user by guid.

        :param guid: user guid
        :raises IndexError: when the guid is unknown
        """
        if guid not in self._users:
            raise IndexError(" guid %s not in users list" % guid)
        return self._users[guid]

    def users_del(self, guid):
        """Delete user by guid.

        :param guid: user guid
        """
        if '-' in guid:
            # convert to bytes
            # NOTE(review): unhexlify on a string still containing '-' will
            # fail on the non-hex character -- presumably the dashes should
            # be stripped first; confirm against callers before changing.
            guid = binascii.unhexlify(guid)
        if guid not in self._users:
            raise IndexError(" guid %s not in users list" % guid)
        self._users_lock.acquire()
        del self._users[guid]
        self._users_lock.release()

    @property
    def users_count(self):
        """Number of stored users."""
        return len(self._users)

    def make_bundle(self, send_func, user_keys=None):
        """Pair every user against every message and emit the bundles.

        :param send_func: called on each generated bundle
        :type send_func: lambda, function, instancemethod
        :param user_keys: user guid list to match against (default: all)
        :type user_keys: list
        :return: number of bundles queued
        """
        user_keys = user_keys or self._users.keys()
        self.logger.debug('[DM] begin mapping of %du * %dm' % (len(user_keys), self.msg_count))
        cnt = 0
        # Highest-priority users first.
        user_keys = sorted(user_keys, key=lambda x: self._users[x].pr, reverse=True)
        for k in user_keys:
            u = self._users[k]
            for _k, m in self._msgs.iteritems():
                _ = u.gen_bundle(m)
                if _:
                    cnt += 1
                    send_func(_)
        if cnt:
            self.logger.debug('[DM] queued %d new bundles' % cnt)
        return cnt

    def run(self):
        """Background greenlet: poll mongo for new messages every minute."""
        self.mongo_instance = mongo()
        while not self._dying:
            msgids = self.mongo_instance.event_get_id(0)
            for i in msgids:
                # generate new MessageObj instance; bind i as a default so
                # the lambda captures the current id, not the loop variable
                m = MessageObj(
                    payload_callback=lambda d=i: self.mongo_instance.event_get_single_info(d),
                    msgid=i
                )
                self.msg_add(m)
            gevent.sleep(60)
            self._save_cache()
class Balancer(object):
    """Task scheduler: verifies submitted tasks, assigns resources via the
    resource graph, and hands runnable tasks to TaskExecutors."""

    def __init__(self, dispatcher):
        self.dispatcher = dispatcher
        self.task_list = []
        self.task_queue = Queue()
        self.resource_graph = dispatcher.resource_graph
        self.queues = {}
        self.threads = []
        self.executors = []
        self.logger = logging.getLogger('Balancer')
        self.dispatcher.require_collection('tasks', 'serial', type='log')
        self.create_initial_queues()
        self.start_executors()
        self.distribution_lock = RLock()
        self.debugger = None
        self.debugged_tasks = None
        self.dispatcher.register_event_type('task.changed')

        # Lets try to get `EXECUTING|WAITING|CREATED` state tasks
        # from the previous dispatcher instance and set their
        # states to 'FAILED' since they are no longer running
        # in this instance of the dispatcher
        for stale_task in dispatcher.datastore.query('tasks', ('state', 'in', ['EXECUTING', 'WAITING', 'CREATED'])):
            self.logger.info('Stale Task ID: {0} Name: {1} being set to FAILED'.format(
                stale_task['id'], stale_task['name']
            ))

            stale_task.update({
                'state': 'FAILED',
                'error': {
                    'message': 'dispatcher process died',
                    'code': errno.EINTR,
                }
            })

            dispatcher.datastore.update('tasks', stale_task['id'], stale_task)

    def create_initial_queues(self):
        self.resource_graph.add_resource(Resource('system'))

    def start_executors(self):
        for i in range(0, self.dispatcher.configstore.get('middleware.executors_count')):
            self.logger.info('Starting task executor #{0}...'.format(i))
            self.executors.append(TaskExecutor(self, i))

    def start(self):
        self.threads.append(gevent.spawn(self.distribution_thread))
        self.logger.info("Started")

    def schema_to_list(self, schema):
        """Wrap a positional-parameter schema list into one array schema."""
        return {
            'type': 'array',
            'items': schema,
            'minItems': sum([1 for x in schema if 'mandatory' in x and x['mandatory']]),
            'maxItems': len(schema)
        }

    def verify_schema(self, clazz, args):
        """Validate *args* against the task class schema; returns error list."""
        if not hasattr(clazz, 'params_schema'):
            return []

        schema = self.schema_to_list(clazz.params_schema)
        val = validator.DefaultDraft4Validator(schema, resolver=self.dispatcher.rpc.get_schema_resolver(schema))
        return list(val.iter_errors(args))

    def submit(self, name, args, sender, env=None):
        """Queue a new task for verification and scheduling.

        :raises RpcException: unknown task type.
        :raises ValueError: *env* is not a dict.
        :return: the new task id
        """
        if name not in self.dispatcher.tasks:
            self.logger.warning("Cannot submit task: unknown task type %s", name)
            raise RpcException(errno.EINVAL, "Unknown task type {0}".format(name))

        task = Task(self.dispatcher, name)
        task.user = sender.user.name
        task.session_id = sender.session_id
        task.created_at = datetime.utcnow()
        task.clazz = self.dispatcher.tasks[name]
        task.args = copy.deepcopy(args)

        if env:
            if not isinstance(env, dict):
                raise ValueError('env must be a dict')
            task.environment = copy.deepcopy(env)

        if self.debugger:
            for m in self.debugged_tasks:
                if fnmatch.fnmatch(name, m):
                    task.debugger = self.debugger

        task.id = self.dispatcher.datastore.insert("tasks", task)
        task.set_state(TaskState.CREATED)
        self.task_queue.put(task)
        self.logger.info("Task %d submitted (type: %s, class: %s)", task.id, name, task.clazz)
        return task.id

    def verify_subtask(self, parent, name, args):
        clazz = self.dispatcher.tasks[name]
        instance = clazz(self.dispatcher, self.dispatcher.datastore)
        return instance.verify(*args)

    def run_subtask(self, parent, name, args):
        """Create and immediately start a subtask of *parent*."""
        args = list(args)
        task = Task(self.dispatcher, name)
        task.created_at = datetime.utcnow()
        task.clazz = self.dispatcher.tasks[name]
        task.args = args
        task.instance = task.clazz(self.dispatcher, self.dispatcher.datastore)
        task.instance.verify(*task.args)
        task.id = self.dispatcher.datastore.insert("tasks", task)
        task.parent = parent

        if self.debugger:
            for m in self.debugged_tasks:
                if fnmatch.fnmatch(name, m):
                    task.debugger = self.debugger

        task.set_state(TaskState.CREATED)
        self.task_list.append(task)
        # If we actually have a non `None` parent task then, add
        # the current subtask to the parent task's subtasks list too
        if parent is not None:
            parent.subtask_ids.append(task.id)
        task.start()
        return task

    def join_subtasks(self, *tasks):
        for i in tasks:
            i.join()

    def abort(self, id):
        """Best-effort abort of a task (and its subtasks) by id."""
        task = self.get_task(id)
        if not task:
            self.logger.warning("Cannot abort task: unknown task id %d", id)
            return

        success = False
        if task.started_at is None:
            success = True
        else:
            try:
                task.executor.abort()
                # Also try to abort any subtasks that might have been running
                for st in task.subtask_ids:
                    self.abort(st)
            except:
                # Deliberate best-effort: abort failures are ignored.
                pass
        # NOTE(review): success stays False when the executor abort path is
        # taken, so the ABORTED state is presumably set elsewhere for running
        # tasks -- confirm before changing.
        if success:
            task.ended.set()
            task.set_state(TaskState.ABORTED, TaskStatus(0, "Aborted"))
            self.logger.debug("Task ID: %d, Name: %s aborted by user", task.id, task.name)

    def task_exited(self, task):
        self.resource_graph.release(*task.resources)
        self.schedule_tasks()

    def schedule_tasks(self):
        """
        This function is called when:
        1) any new task is submitted to any of the queues
        2) any task exists

        :return:
        """
        for task in [t for t in self.task_list if t.state == TaskState.WAITING]:
            if not self.resource_graph.can_acquire(*task.resources):
                continue

            self.resource_graph.acquire(*task.resources)
            self.threads.append(task.start())

    def distribution_thread(self):
        """Verification loop: dequeue submitted tasks, verify them, and move
        them to WAITING (or FAILED)."""
        while True:
            self.task_queue.peek()
            self.distribution_lock.acquire()
            task = self.task_queue.get()

            try:
                self.logger.debug("Picked up task %d: %s with args %s", task.id, task.name, task.args)
                errors = self.verify_schema(self.dispatcher.tasks[task.name], task.args)
                if len(errors) > 0:
                    errors = list(validator.serialize_errors(errors))
                    self.logger.warning("Cannot submit task {0}: schema verification failed with errors {1}".format(
                        task.name, errors
                    ))
                    raise ValidationException(extra=errors)

                task.instance = task.clazz(self.dispatcher, self.dispatcher.datastore)
                task.resources = task.instance.verify(*task.args)

                if type(task.resources) is not list:
                    raise ValueError("verify() returned something else than resource list")

            except Exception as err:
                self.logger.warning("Cannot verify task %d: %s", task.id, err)
                task.set_state(TaskState.FAILED, TaskStatus(0), serialize_error(err))
                self.task_list.append(task)
                task.ended.set()
                self.distribution_lock.release()

                # BUG FIX: was `isinstance(Exception, VerifyException)` which
                # is always False, so EVERY failure (including legitimate
                # VerifyExceptions) was reported as an invalid exception.
                # Also format the message instead of passing a bare template.
                if not isinstance(err, VerifyException):
                    self.dispatcher.report_error(
                        'Task {0} verify() method raised invalid exception'.format(err),
                        err
                    )

                continue

            task.set_state(TaskState.WAITING)
            self.task_list.append(task)
            self.distribution_lock.release()
            self.schedule_tasks()
            self.logger.debug("Task %d assigned to resources %s", task.id, ','.join(task.resources))

    def assign_executor(self, task):
        """Attach an idle executor to *task*, spawning one when all busy."""
        for i in self.executors:
            if i.state == WorkerState.IDLE:
                i.checked_in.wait()
                self.logger.info("Task %d assigned to executor #%d", task.id, i.index)
                task.executor = i
                i.state = WorkerState.EXECUTING
                return

        # Out of executors! Need to spawn new one
        executor = TaskExecutor(self, len(self.executors))
        self.executors.append(executor)
        executor.checked_in.wait()
        executor.state = WorkerState.EXECUTING
        task.executor = executor
        self.logger.info("Task %d assigned to executor #%d", task.id, executor.index)

    def dispose_executors(self):
        for i in self.executors:
            i.die()

    def get_active_tasks(self):
        """Return tasks that are not yet finished."""
        return [x for x in self.task_list if x.state in (
            TaskState.CREATED,
            TaskState.WAITING,
            TaskState.EXECUTING)]

    def get_tasks(self, type=None):
        """Return all tasks, or only those whose state equals *type*."""
        if type is None:
            return self.task_list

        return [x for x in self.task_list if x.state == type]

    def get_task(self, id):
        """Find a task by id in the scheduled list or the pending queue."""
        self.distribution_lock.acquire()
        t = first_or_default(lambda x: x.id == id, self.task_list)
        if not t:
            t = first_or_default(lambda x: x.id == id, self.task_queue.queue)
        self.distribution_lock.release()
        return t

    def get_executor_by_key(self, key):
        return first_or_default(lambda t: t.key == key, self.executors)

    def get_executor_by_sender(self, sender):
        return first_or_default(lambda t: t.conn == sender, self.executors)
class ResourceGraph(object):
    """Directed graph of named resources with busy-flag based locking.

    Edges run parent -> child; acquiring a resource requires that none of
    its descendants are busy. All operations hold ``self.mutex``.
    """

    def __init__(self):
        self.logger = logging.getLogger("ResourceGraph")
        self.mutex = RLock()
        self.root = Resource("root")
        self.resources = nx.DiGraph()
        self.resources.add_node(self.root)

    def lock(self):
        self.mutex.acquire()

    def unlock(self):
        self.mutex.release()

    @property
    def nodes(self):
        return self.resources.nodes()

    def add_resource(self, resource, parents=None):
        """Insert *resource* under *parents* (default: root).

        :raises ResourceError: invalid/duplicate resource or unknown parent.
        """
        with self.mutex:
            if not resource:
                raise ResourceError("Invalid resource")

            if self.get_resource(resource.name):
                raise ResourceError("Resource {0} already exists".format(resource.name))

            self.resources.add_node(resource)
            if not parents:
                parents = ["root"]

            for p in parents:
                node = self.get_resource(p)
                if not node:
                    raise ResourceError("Invalid parent resource {0}".format(p))
                self.resources.add_edge(node, resource)

    def remove_resource(self, name):
        """Remove the named resource and its entire descendant subtree."""
        with self.mutex:
            resource = self.get_resource(name)

            if not resource:
                return

            for i in nx.descendants(self.resources, resource):
                self.resources.remove_node(i)

            self.resources.remove_node(resource)

    def remove_resources(self, names):
        """Remove every named resource (and its descendants)."""
        with self.mutex:
            for name in names:
                resource = self.get_resource(name)

                # BUG FIX: a missing name used `return`, silently skipping
                # removal of all remaining resources; skip just this one.
                if not resource:
                    continue

                for i in nx.descendants(self.resources, resource):
                    self.resources.remove_node(i)

                self.resources.remove_node(resource)

    def update_resource(self, name, new_parents):
        """Re-parent the named resource under *new_parents*.

        :raises ResourceError: when any new parent does not exist.
        """
        with self.mutex:
            resource = self.get_resource(name)

            if not resource:
                return

            for i in self.resources.predecessors(resource):
                self.resources.remove_edge(i, resource)

            for p in new_parents:
                node = self.get_resource(p)
                if not node:
                    raise ResourceError("Invalid parent resource {0}".format(p))
                self.resources.add_edge(node, resource)

    def get_resource(self, name):
        """Return the node named *name*, or None."""
        f = [i for i in self.resources.nodes() if i.name == name]
        return f[0] if len(f) > 0 else None

    def get_resource_dependencies(self, name):
        """Yield the names of the direct parents of *name*."""
        res = self.get_resource(name)
        for i, _ in self.resources.in_edges([res]):
            yield i.name

    def acquire(self, *names):
        """Mark the named resources busy.

        :raises ResourceError: unknown resource or busy dependent resource.
        """
        if not names:
            return

        with self.mutex:
            self.logger.debug("Acquiring following resources: %s", ",".join(names))

            for name in names:
                res = self.get_resource(name)
                if not res:
                    raise ResourceError("Resource {0} not found".format(name))

                for i in nx.descendants(self.resources, res):
                    if i.busy:
                        raise ResourceError("Cannot acquire, some of dependent resources are busy")

                res.busy = True

    def can_acquire(self, *names):
        """Return True when every named resource (and descendants) is free."""
        if not names:
            return True

        with self.mutex:
            self.logger.log(TRACE, "Trying to acquire following resources: %s", ",".join(names))

            for name in names:
                res = self.get_resource(name)
                if not res:
                    return False

                if res.busy:
                    return False

                for i in nx.descendants(self.resources, res):
                    if i.busy:
                        return False

            return True

    def release(self, *names):
        """Clear the busy flag on the named resources."""
        if not names:
            return

        with self.mutex:
            self.logger.debug("Releasing following resources: %s", ",".join(names))

            for name in names:
                res = self.get_resource(name)
                res.busy = False
class HttpScannerOutput(object):
    """Aggregates all scanner output channels: console, log file, CSV, JSON,
    response-body dump folder and SQL database.

    write() spawns a greenlet per result; write_func() serializes actual
    output through ``self.lock``.
    """

    def __init__(self, args):
        # TODO: make separate queues for fast logging
        self.args = args
        self.lock = RLock()
        # Colorama init
        init()
        # Initialise logging
        self._init_logger()
        # Initialise output
        self._init_output()
        # Stats
        # NOTE(review): urls_scanned is read by _display_progress but never
        # incremented inside this class — presumably updated by the scanner;
        # verify against the caller.
        self.urls_scanned = 0

    def _init_output(self):
        # Initialise every output channel in turn.
        self._init_requests_output()
        self._init_csv()
        self._init_json()
        self._init_dump()
        self._init_db()

    def _init_logger(self):
        """
        Init logger
        :return: None
        """
        if self.args.log_file is not None:
            self.logger = logging.getLogger('httpscan_logger')
            self.logger.setLevel(logging.DEBUG if self.args.debug else logging.INFO)
            handler = logging.FileHandler(self.args.log_file)
            handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%d.%m.%Y %H:%M:%S'))
            self.logger.addHandler(handler)
        else:
            self.logger = None

    def _init_requests_output(self):
        """
        Init requests library output
        :return: None
        """
        if self.args.debug:
            # Enable requests lib debug output
            HTTPConnection.debuglevel = 5
            packages.urllib3.add_stderr_logger()
            logging.basicConfig()
            logging.getLogger().setLevel(logging.DEBUG)
            requests_log = logging.getLogger("requests.packages.urllib3")
            requests_log.setLevel(logging.DEBUG)
            requests_log.propagate = True
        else:
            # Surpress InsecureRequestWarning: Unverified HTTPS request is being made
            packages.urllib3.disable_warnings()

    def _init_csv(self):
        """
        Initialise CSV output
        :return: None
        """
        if self.args.output_csv is None:
            self.csv = None
        else:
            # TODO: check if file exists
            self.csv = writer(open(self.args.output_csv, 'wb'), delimiter=';', quoting=QUOTE_ALL)
            self.csv.writerow(['url', 'status', 'length', 'headers'])

    def _init_json(self):
        """
        Initialise JSON output
        :return: None
        """
        self.json = None if self.args.output_json is None else io.open(self.args.output_json, 'w', encoding='utf-8')

    def _init_dump(self):
        """
        Initialise dump folder
        :return: None
        """
        self.dump = path.abspath(self.args.dump) if self.args.dump is not None else None
        if self.dump is not None and not path.exists(self.dump):
            makedirs(self.dump)

    def _init_db(self):
        """
        Initialise database output. Create database and table if missing.
        :return: None
        """
        if self.args.output_database is None:
            self.engine = None
            return

        # Check and create database if needed
        if not database_exists(self.args.output_database):
            create_database(self.args.output_database, encoding='utf8')

        # Create table
        self.engine = create_engine(self.args.output_database)
        self.metadata = MetaData()
        self.scan_table = Table('httpscan', self.metadata,
                                Column('id', Integer, primary_key=True),
                                Column('url', String),
                                Column('status', Integer),
                                Column('length', Integer),
                                Column('headers', String)
                                )
        self.metadata.create_all(self.engine)

    def write(self, **kwargs):
        """Asynchronously process one scan result (spawns a greenlet)."""
        spawn(self.write_func, **kwargs)

    def write_func(self, **kwargs):
        """Synchronously emit one scan result to every configured channel."""
        # BUGFIX: lock was acquired/released without try/finally, so an
        # exception in any output channel left it held forever.
        with self.lock:
            # Output
            self._display_progress(**kwargs)
            self._write_log(**kwargs)

            # Check for exception
            if kwargs['exception'] is None:
                self._filter_and_write(**kwargs)

    def _display_progress(self, **kwargs):
        # TODO: add detailed stats
        # Calculate progreess
        percentage = '{percent:.2%}'.format(percent=float(self.urls_scanned) / self.args.urls_count)

        # Generate and print colored output
        out = '[%s] [worker:%02i] [%s]\t%s -> status:%i ' % (
            helper.str_now(), kwargs['worker'], percentage, kwargs['url'], kwargs['status'])
        if kwargs['exception'] is not None:
            out += 'error: (%s)' % str(kwargs['exception'])
        else:
            out += 'length: %s' % naturalsize(int(kwargs['length']))
        # Green for 200, red for client errors / failures, yellow otherwise.
        if kwargs['status'] == 200:
            print(Fore.GREEN + out + Fore.RESET)
        elif 400 <= kwargs['status'] < 500 or kwargs['status'] == -1:
            print(Fore.RED + out + Fore.RESET)
        else:
            print(Fore.YELLOW + out + Fore.RESET)

    def _filter_and_write(self, **kwargs):
        # Filter responses and save responses that are matching ignore, allow rules
        if (self.args.allow is None and self.args.ignore is None) or \
                (self.args.allow is not None and kwargs['status'] in self.args.allow) or \
                (self.args.ignore is not None and kwargs['status'] not in self.args.ignore):
            self._write_csv(**kwargs)
            self._write_json(**kwargs)
            self._write_dump(**kwargs)
            self._write_db(**kwargs)

    def _kwargs_to_params(self, **kwargs):
        """Map a result kwargs dict onto the DB/JSON column names."""
        return {'url': kwargs['url'], 'status': kwargs['status'], 'length': kwargs['length'],
                'headers': str(kwargs['response'].headers)}

    def _write_log(self, **kwargs):
        # Write to log file
        if self.logger is None:
            return
        out = '[worker:%02i] %s %s %i' % (kwargs['worker'], kwargs['url'], kwargs['status'], kwargs['length'])
        if kwargs['exception'] is None:
            self.logger.info(out)
        else:
            self.logger.error("%s %s" % (out, str(kwargs['exception'])))

    def _write_csv(self, **kwargs):
        if self.csv is not None:
            self.csv.writerow([kwargs['url'], kwargs['status'], kwargs['length'],
                               str(kwargs['response'].headers)])

    def _write_json(self, **kwargs):
        if self.json is None:
            return
        # TODO: bugfix appending json
        self.json.write(unicode(dumps(self._kwargs_to_params(**kwargs), ensure_ascii=False)))

    def _write_dump(self, **kwargs):
        if kwargs['response'] is None or self.dump is None:
            return

        # Generate folder and file path mirroring the URL structure
        parsed = urlparse(kwargs['url'])
        host_folder = path.join(self.dump, parsed.netloc)
        p, f = path.split(parsed.path)
        folder = path.join(host_folder, p[1:])
        if not path.exists(folder):
            makedirs(folder)
        filename = path.join(folder, f)

        # Get all content
        try:
            content = kwargs['response'].content
        except Exception as exception:
            self.write_log('Failed to get content for %s Exception: %s' % (kwargs['url'], str(exception)))
            return

        # Save contents to file
        with open(filename, 'wb') as f:
            f.write(content)

    def _write_db(self, **kwargs):
        if self.engine is None:
            return
        # TODO: check if url exists in table
        params = self._kwargs_to_params(**kwargs)
        self.engine.execute(self.scan_table.insert().execution_options(autocommit=True), params)

    def write_log(self, msg, loglevel=logging.INFO):
        """
        Write message to log file
        :param msg:
        :param loglevel:
        :return: None
        """
        if self.logger is None:
            return
        # BUGFIX: same unprotected acquire/release pattern as write_func.
        with self.lock:
            if loglevel == logging.INFO:
                self.logger.info(msg)
            elif loglevel == logging.DEBUG:
                self.logger.debug(msg)
            elif loglevel == logging.ERROR:
                self.logger.error(msg)
            elif loglevel == logging.WARNING:
                self.logger.warning(msg)

    def print_and_log(self, msg, loglevel=logging.INFO):
        # TODO: make separate logging
        print('[%s] %s' % (helper.str_now(), msg))
        self.write_log(msg, loglevel)
class QueueHandler(object):
    """Drives two gevent loops: a periodic full-map push and a per-user push
    for users arriving on the online queue."""

    def __init__(self, logger, pending_online_users, make_func, send_func):
        """Initialize Queue Handler

        :param logger: logger object
        :type logger: Logger
        :param pending_online_users: online users queue
        :type pending_online_users: gevent.queue
        :param make_func: the function to make bundle
        :type make_func: lambda,instancemethod,function
        :param send_func: the function to send bundle
        :type send_func: lambda,instancemethod,function
        """
        self.alive = True
        self.last_idx = None
        self.logger = logger
        self.pending_online_users = pending_online_users
        self._pause_lock = RLock()
        self._make_func = make_func
        self._send_func = send_func
        # BUGFIX: pre-initialize so shutdown() before run() does not raise
        # AttributeError; killall([]) is a harmless no-op.
        self.greenlets = []

    def shutdown(self):
        """Stop both loops by killing their greenlets."""
        self.alive = False
        gevent.killall(self.greenlets)

    def run(self):
        """Spawn the periodic and per-user push loops."""
        self.greenlets = [
            gevent.spawn(self.main_loop),
            gevent.spawn(self.online_loop)
        ]

    def pause(self):
        """Block the periodic loop (holds the pause lock until resume())."""
        self._pause_lock.acquire()

    def resume(self):
        """Release the pause lock acquired by pause()."""
        self._pause_lock.release()

    @property
    def qsize(self):
        # NOTE(review): self.bundle_queue is never assigned anywhere in this
        # class, so accessing this property raises AttributeError — confirm
        # whether pending_online_users was intended here.
        return self.bundle_queue.qsize()

    def main_loop(self):
        """Periodically build and send the full m*n bundle."""
        while True:
            self._pause_lock.acquire()
            try:
                # call DataMgr.make_bundle to make bundle of full m*n map
                # and pass function _send_func(GatewayMgr.send_push) as argument
                self._make_func(self._send_func)
            finally:
                # BUGFIX: an exception in _make_func used to leave the pause
                # lock held, permanently wedging pause()/main_loop().
                self._pause_lock.release()
            #TODO sleep longer
            #gevent.sleep(random.random())
            gevent.sleep(MSG_CHECK_INTERV)

    def online_loop(self):
        """Send a full m*1 bundle to each user popped off the online queue."""
        while True:
            u = self.pending_online_users.get()
            # call DataMgr.make_bundle to make bundle of full m*1 map for specific user
            self._make_func(self._send_func, user_keys = [u])
            # context switch
            gevent.sleep(0)
class ResourceGraph(object):
    """Thread-safe directed graph of named Resource nodes rooted at 'root'.

    Edges point from parent to dependent child; ``nx.descendants`` of a node
    are its dependents.  All operations are serialized through ``self.mutex``.
    """

    def __init__(self):
        self.logger = logging.getLogger('ResourceGraph')
        self.mutex = RLock()
        self.root = Resource('root')
        self.resources = nx.DiGraph()
        self.resources.add_node(self.root)

    def lock(self):
        """Explicitly acquire the graph mutex (pair with unlock())."""
        self.mutex.acquire()

    def unlock(self):
        """Release the graph mutex acquired via lock()."""
        self.mutex.release()

    @property
    def nodes(self):
        """All Resource nodes currently in the graph."""
        return self.resources.nodes()

    def add_resource(self, resource, parents=None, children=None):
        """Add *resource* under the named parents (default: root) and above
        the named children.

        Unknown parent names are skipped; unknown child names raise.

        :raises ResourceError: if the resource is falsy, already exists,
            or a child name is unknown.
        """
        with self.mutex:
            if not resource:
                raise ResourceError('Invalid resource')
            if self.get_resource(resource.name):
                raise ResourceError('Resource {0} already exists'.format(resource.name))
            self.resources.add_node(resource)
            if not parents:
                parents = ['root']
            for p in parents:
                node = self.get_resource(p)
                if not node:
                    continue
                self.resources.add_edge(node, resource)
            for p in children or []:
                node = self.get_resource(p)
                if not node:
                    raise ResourceError('Invalid child resource {0}'.format(p))
                # BUGFIX: the child edge was validated but never added,
                # unlike the equivalent branch in update_resource().
                self.resources.add_edge(resource, node)

    def remove_resource(self, name):
        """Remove the named resource and all of its descendants; no-op if absent."""
        with self.mutex:
            resource = self.get_resource(name)
            if not resource:
                return
            for i in nx.descendants(self.resources, resource):
                self.resources.remove_node(i)
            self.resources.remove_node(resource)

    def remove_resources(self, names):
        """Remove each named resource (and its descendants).

        BUGFIX: an unknown name used to ``return`` and silently drop all
        remaining names; it now skips just that name.
        """
        with self.mutex:
            for name in names:
                resource = self.get_resource(name)
                if not resource:
                    continue  # was: return — aborted processing of remaining names
                for i in nx.descendants(self.resources, resource):
                    self.resources.remove_node(i)
                self.resources.remove_node(resource)

    def rename_resource(self, oldname, newname):
        """Rename a resource in place; no-op if *oldname* is unknown."""
        with self.mutex:
            resource = self.get_resource(oldname)
            if not resource:
                return
            resource.name = newname

    def update_resource(self, name, new_parents, new_children=None):
        """Re-parent the named resource and optionally re-attach children.

        Unknown parent names are skipped; unknown child names raise.
        """
        with self.mutex:
            resource = self.get_resource(name)
            if not resource:
                return
            # Detach from all current parents before re-attaching.
            for i in self.resources.predecessors(resource):
                self.resources.remove_edge(i, resource)
            for p in new_parents:
                node = self.get_resource(p)
                if not node:
                    continue
                self.resources.add_edge(node, resource)
            for p in new_children or []:
                node = self.get_resource(p)
                if not node:
                    raise ResourceError('Invalid child resource {0}'.format(p))
                self.resources.add_edge(resource, node)

    def get_resource(self, name):
        """Return the Resource with the given name, or None."""
        f = [i for i in self.resources.nodes() if i.name == name]
        return f[0] if len(f) > 0 else None

    def get_resource_dependencies(self, name):
        """Yield the names of the direct parents of the named resource."""
        res = self.get_resource(name)
        for i, _ in self.resources.in_edges([res]):
            yield i.name

    def acquire(self, *names):
        """Mark the named resources busy.

        :raises ResourceError: if a name is unknown, the resource itself is
            already busy, or any dependent (descendant) resource is busy.
        """
        if not names:
            return
        with self.mutex:
            self.logger.debug('Acquiring following resources: %s', ','.join(names))
            for name in names:
                res = self.get_resource(name)
                if not res:
                    raise ResourceError('Resource {0} not found'.format(name))
                # BUGFIX: also reject an already-busy resource itself, matching
                # the check performed by can_acquire().
                if res.busy:
                    raise ResourceError('Cannot acquire, some of dependent resources are busy')
                for i in nx.descendants(self.resources, res):
                    if i.busy:
                        raise ResourceError('Cannot acquire, some of dependent resources are busy')
                res.busy = True

    def can_acquire(self, *names):
        """Return True iff every named resource exists and neither it nor any descendant is busy."""
        if not names:
            return True
        with self.mutex:
            self.logger.log(TRACE, 'Trying to acquire following resources: %s', ','.join(names))
            for name in names:
                res = self.get_resource(name)
                if not res:
                    return False
                if res.busy:
                    return False
                for i in nx.descendants(self.resources, res):
                    if i.busy:
                        return False
            return True

    def release(self, *names):
        """Clear the busy flag on each named resource; unknown names are skipped."""
        if not names:
            return
        with self.mutex:
            self.logger.debug('Releasing following resources: %s', ','.join(names))
            for name in names:
                res = self.get_resource(name)
                # BUGFIX: guard against an unknown name (was AttributeError on None).
                if res:
                    res.busy = False

    def draw(self, path):
        """Write the graph to *path* in Graphviz dot format, quoting node names."""
        return nx.write_dot(nx.relabel_nodes(self.resources, lambda n: f'"{n.name}"'), path)
def __init__(self):
    """Build an empty resource graph containing only the 'root' node."""
    self.resources = nx.DiGraph()
    self.root = Resource('root')
    self.resources.add_node(self.root)
    self.mutex = RLock()
    self.logger = logging.getLogger('ResourceGraph')