class RedisListSink(BaseThreadedModule):
    """
    Send events to a redis lists.

    list: Name of redis list to send data to.
    server: Redis server to connect to.
    port: Port redis server is listening on.
    db: Redis db.
    password: Redis password.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Send data to redis if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - RedisListSink:
       list: # <type: String; is: required>
       server: # <default: 'localhost'; type: string; is: optional>
       port: # <default: 6379; type: integer; is: optional>
       db: # <default: 0; type: integer; is: optional>
       password: # <default: None; type: None||string; is: optional>
       format: # <default: None; type: None||string; is: optional>
       store_interval_in_secs: # <default: 5; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""
    can_run_forked = True

    def configure(self, configuration):
        """Read the configuration, create the redis client and verify the connection with a ping."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.list = self.getConfigurationValue('list')
        self.client = redis.StrictRedis(
            host=self.getConfigurationValue('server'),
            port=self.getConfigurationValue('port'),
            password=self.getConfigurationValue('password'),
            db=self.getConfigurationValue('db'))
        try:
            self.client.ping()
        except Exception:
            # Catch Exception instead of everything, so SystemExit/KeyboardInterrupt still propagate.
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not connect to redis store at %s. Exception: %s, Error: %s."
                % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()

    def getStartMessage(self):
        """Return a one line summary of the target list, server and buffer size."""
        return "[%s] on %s:%s. Max buffer size: %d" % (
            self.list,
            self.getConfigurationValue('server'),
            self.getConfigurationValue('port'),
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the send buffer; storeData is registered as its flush callback."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def storeData(self, buffered_data):
        """
        Push all buffered items onto the redis list with a single RPUSH.

        Returns True on success (or when there is nothing to send), False if the push failed.
        """
        if not buffered_data:
            # RPUSH without values is rejected by the client/server, so skip the round trip.
            return True
        try:
            self.client.rpush(self.list, *buffered_data)
            return True
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            self.logger.error(
                "Could not add event to redis list %s. Exception: %s, Error: %s."
                % (self.list, exc_type, exc_value))
            return False

    def handleEvent(self, event):
        """Queue the formatted event (or the raw event if no format is set) for transmission."""
        if self.format:
            publish_data = mapDynamicValue(self.format, event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def shutDown(self):
        """Flush pending events on a best-effort basis, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be triggered from configure(), before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Could not flush buffer during shutdown. Exception: %s, Error: %s."
                % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class ZabbixSink(BaseThreadedModule):
    """
    Send events to zabbix.

    hostname: Hostname for which the metrics should be stored.
    fields: Event fields to send.
    field_prefix: Prefix to prepend to field names. For e.g. cpu_count field with default lumbermill_ prefix, the Zabbix key is lumbermill_cpu_count.
    timestamp_field: Field to provide timestamp. If not provided, current timestamp is used.
    agent_conf: Path to zabbix_agent configuration file. If set to True defaults to /etc/zabbix/zabbix_agentd.conf.
    server: Address of zabbix server. If port differs from default it can be set by appending it, e.g. 127.0.0.1:10052.
    store_interval_in_secs: sending data to es in x seconds intervals.
    batch_size: sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - ZabbixSink:
       hostname: # <type: string; is: required>
       fields: # <type: list; is: required>
       field_prefix: # <default: "lumbermill_"; type: string; is: optional>
       timestamp_field: # <default: "timestamp"; type: string; is: optional>
       agent_conf: # <default: True; type: boolean||string; is: optional>
       server: # <default: False; type: boolean||string; is: required if agent_conf is False else optional>
       store_interval_in_secs: # <default: 10; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """
        Read the configuration and create the ZabbixSender.

        The sender is built either from an agent configuration file (agent_conf) or from an
        explicit "host[:port]" server address. Invalid settings trigger a lumbermill shutdown.
        """
        BaseThreadedModule.configure(self, configuration)
        self.hostname = self.getConfigurationValue("hostname")
        self.fields = self.getConfigurationValue("fields")
        self.field_prefix = self.getConfigurationValue("field_prefix")
        self.timestamp_field = self.getConfigurationValue("timestamp_field")
        self.batch_size = self.getConfigurationValue('batch_size')
        self.backlog_size = self.getConfigurationValue('backlog_size')
        self.agent_conf = self.getConfigurationValue("agent_conf")
        if self.agent_conf:
            if self.agent_conf is True:
                self.agent_conf = "/etc/zabbix/zabbix_agentd.conf"
            if not os.path.isfile(self.agent_conf):
                self.logger.error("%s does not point to an existing file." % self.agent_conf)
                self.lumbermill.shutDown()
                return
            self.zabbix_sender = ZabbixSender(use_config=self.agent_conf)
        else:
            server = self.getConfigurationValue("server")
            if not server:
                self.logger.error("Either agent_conf or server needs to be configured.")
                self.lumbermill.shutDown()
                return
            port = 10051
            if ":" in server:
                # Split on the last colon only and hand the port over as a number.
                server, port = server.rsplit(":", 1)
                try:
                    port = int(port)
                except ValueError:
                    self.logger.error("%s is not a valid port number." % port)
                    self.lumbermill.shutDown()
                    return
            self.zabbix_sender = ZabbixSender(zabbix_server=server, port=port)
        # The buffer itself is created in initAfterFork, like in the other sinks of this file.

    def getStartMessage(self):
        """Return a one line summary naming the agent config or server and the buffer size."""
        if self.agent_conf:
            return "Config: %s. Max buffer size: %d" % (self.agent_conf, self.getConfigurationValue('backlog_size'))
        else:
            return "Server: %s. Max buffer size: %d" % (self.getConfigurationValue("server"), self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the send buffer; storeData is registered as its flush callback."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def handleEvent(self, event):
        """Queue the event for transmission."""
        self.buffer.append(event)
        yield None

    def storeData(self, events):
        """
        Build one ZabbixMetric per configured field of every event and send them in one packet.

        Fields missing from an event are skipped. A warning is logged if zabbix reports
        failed metrics.
        """
        packet = []
        for event in events:
            # Default to None, so the name is always bound, even without a timestamp field.
            timestamp = None
            if self.timestamp_field:
                timestamp = event.get(self.timestamp_field)
            hostname = mapDynamicValue(self.hostname, mapping_dict=event, use_strftime=True)
            for field_name in self.fields:
                try:
                    value = event[field_name]
                except KeyError:
                    # Field not present in this event, nothing to report for it.
                    continue
                packet.append(ZabbixMetric(hostname, "%s%s" % (self.field_prefix, field_name), value, timestamp))
        if not packet:
            # No metric could be built, do not send an empty packet.
            return
        response = self.zabbix_sender.send(packet)
        if response.failed != 0:
            self.logger.warning("%d of %d metrics were not processed correctly." % (response.total-response.processed, response.total))

    def shutDown(self):
        """Flush pending events on a best-effort basis, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be triggered from configure(), before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Could not flush buffer during shutdown. Exception: %s, Error: %s."
                % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class RedisListSink(BaseThreadedModule):
    """
    Send events to a redis lists.

    list: Name of redis list to send data to.
    server: Redis server to connect to.
    port: Port redis server is listening on.
    db: Redis db.
    password: Redis password.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Send data to redis if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - RedisListSink:
       list: # <type: String; is: required>
       server: # <default: 'localhost'; type: string; is: optional>
       port: # <default: 6379; type: integer; is: optional>
       db: # <default: 0; type: integer; is: optional>
       password: # <default: None; type: None||string; is: optional>
       format: # <default: None; type: None||string; is: optional>
       store_interval_in_secs: # <default: 5; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""
    can_run_forked = True

    def configure(self, configuration):
        """Read the configuration, create the redis client and verify the connection with a ping."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.list = self.getConfigurationValue('list')
        self.client = redis.StrictRedis(host=self.getConfigurationValue('server'),
                                        port=self.getConfigurationValue('port'),
                                        password=self.getConfigurationValue('password'),
                                        db=self.getConfigurationValue('db'))
        try:
            self.client.ping()
        except Exception:
            # Catch Exception instead of everything, so SystemExit/KeyboardInterrupt still propagate.
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()

    def getStartMessage(self):
        """Return a one line summary of the server, the target list and the buffer size."""
        return "publishing to %s:%s -> %s. Max buffer size: %d" % (self.getConfigurationValue('server'),
                                                                   self.getConfigurationValue('port'),
                                                                   self.list,
                                                                   self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the send buffer; storeData is registered as its flush callback."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))

    def storeData(self, buffered_data):
        """
        Push all buffered items onto the redis list with a single RPUSH.

        Returns True on success (or when there is nothing to send), False if the push failed.
        """
        if not buffered_data:
            # RPUSH without values is rejected by the client/server, so skip the round trip.
            return True
        try:
            self.client.rpush(self.list, *buffered_data)
            return True
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            self.logger.error("Could not add event to redis list %s. Exception: %s, Error: %s." % (self.list, exc_type, exc_value))
            return False

    def handleEvent(self, event):
        """Queue the formatted event (or the raw event if no format is set) for transmission."""
        if self.format:
            publish_data = mapDynamicValue(self.format, event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def shutDown(self):
        """Flush pending events on a best-effort basis, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be triggered from configure(), before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning("Could not flush buffer during shutdown. Exception: %s, Error: %s." % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class ZmqSink(BaseThreadedModule):
    """
    Sends events to zeromq.

    server: Server to connect to. Pattern: hostname:port.
    pattern: Either push or pub.
    mode: Whether to run a server or client. If running as server, pool size is restricted to a single process.
    topic: The channels topic.
    hwm: Highwatermark for sending socket.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send msgpacked.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Send data to redis if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - ZmqSink:
       server: # <default: 'localhost:5570'; type: string; is: optional>
       pattern: # <default: 'push'; type: string; values: ['push', 'pub']; is: optional>
       mode: # <default: 'connect'; type: string; values: ['connect', 'bind']; is: optional>
       topic: # <default: None; type: None||string; is: optional>
       hwm: # <default: None; type: None||integer; is: optional>
       format: # <default: None; type: None||string; is: optional>
       store_interval_in_secs: # <default: 5; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 500; type: integer; is: optional>
    """

    # This module sends events, so it is an output module like the other sinks in this file.
    module_type = "output"
    """Set module type"""
    can_run_forked = True

    def configure(self, configuration):
        """Read the configuration. Forking is disabled when the socket is bound (mode 'bind')."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.server = None
        self.topic = self.getConfigurationValue('topic')
        self.format = self.getConfigurationValue('format')
        self.mode = self.getConfigurationValue('mode')
        if self.mode == "bind":
            self.can_run_forked = False

    def initZmqContext(self):
        """Create the zmq context and the PUSH or PUB socket, then connect or bind it to the configured server."""
        self.zmq_context = zmq.Context()
        if self.getConfigurationValue('pattern') == 'push':
            self.client = self.zmq_context.socket(zmq.PUSH)
        else:
            self.client = self.zmq_context.socket(zmq.PUB)
        hwm = self.getConfigurationValue('hwm')
        if hwm:
            try:
                self.client.setsockopt(zmq.SNDHWM, hwm)
            except Exception:
                # Fall back to the plain HWM option (presumably for older zeromq versions without SNDHWM).
                self.client.setsockopt(zmq.HWM, hwm)
        server_name, server_port = self.getConfigurationValue('server').split(":")
        try:
            server_addr = socket.gethostbyname(server_name)
        except socket.gaierror:
            # Name could not be resolved, use it as given.
            server_addr = server_name
        try:
            if self.getConfigurationValue('mode') == 'connect':
                self.client.connect('tcp://%s:%s' % (server_addr, server_port))
            else:
                self.client.bind('tcp://%s:%s' % (server_addr, server_port))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to zeromq at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()

    def getStartMessage(self):
        """Return a one line summary of the configured server and the buffer size."""
        # Read the address from the configuration; self.server is only initialized to None in configure.
        return "%s. Max buffer size: %d" % (self.getConfigurationValue('server'), self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the zmq socket and the send buffer; storeData is registered as the buffer's flush callback."""
        BaseThreadedModule.initAfterFork(self)
        self.initZmqContext()
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))

    def storeData(self, buffered_data):
        """
        Send every buffered item over the zmq socket.

        Returns True if all items were sent and False on a send error. A terminated zmq
        context is ignored silently (returns None).
        """
        try:
            for data in buffered_data:
                self.client.send("%s" % data)
            return True
        except zmq.error.ContextTerminated:
            pass
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # Compare the error text, not the exception object, with the messages that are not worth logging.
            if str(exc_value) in ['Interrupted system call', 'Socket operation on non-socket']:
                return False
            self.logger.error("Could not add events to zmq. Exception: %s, Error: %s." % (exc_type, exc_value))
            return False

    def handleEvent(self, event):
        """Queue the formatted (or msgpacked) event, prefixed with the topic if one is configured."""
        if self.format:
            publish_data = mapDynamicValue(self.format, event)
        else:
            publish_data = msgpack.packb(event)
        if self.topic:
            publish_data = "%s %s" % (self.topic, publish_data)
        self.buffer.append(publish_data)
        yield None

    def shutDown(self):
        """Flush pending events, close the socket and the zmq context, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be called before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning("Could not flush buffer during shutdown. Exception: %s, Error: %s." % (etype, evalue))
        try:
            self.client.close()
            self.zmq_context.term()
        except AttributeError:
            pass
        # Call parent shutDown method.
        BaseThreadedModule.shutDown(self)
class ElasticSearchSink(BaseThreadedModule):
    """
    Store the data dictionary in an elasticsearch index.

    The elasticsearch module takes care of discovering all nodes of the elasticsearch cluster.
    Requests will the be loadbalanced via round robin.

    action: Either index or update. If update be sure to provide the correct doc_id.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send.
    nodes: Configures the elasticsearch nodes.
    read_timeout: Set number of seconds to wait until requests to elasticsearch will time out.
    connection_type: One of: 'urllib3', 'requests'.
    http_auth: 'user:password'.
    use_ssl: One of: True, False.
    index_name: Sets the index name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    doc_id: Sets the es document id for the committed event data.
    routing: Sets a routing value (@see: http://www.elasticsearch.org/blog/customizing-your-document-routing/) Timepatterns like %Y.%m.%d are allowed here.
    ttl: When set, documents will be automatically deleted after ttl expired. Can either set time in milliseconds or elasticsearch date format, e.g.: 1d, 15m etc. This feature needs to be enabled for the index. @See: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-ttl-field.html
    sniff_on_start: The client can be configured to inspect the cluster state to get a list of nodes upon startup. Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    sniff_on_connection_fail: The client can be configured to inspect the cluster state to get a list of nodes upon failure. Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    consistency: One of: 'one', 'quorum', 'all'.
    store_interval_in_secs: Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - ElasticSearchSink:
       action: # <default: 'index'; type: string; is: optional; values: ['index', 'update']>
       format: # <default: None; type: None||string; is: optional>
       nodes: # <type: string||list; is: required>
       read_timeout: # <default: 10; type: integer; is: optional>
       connection_type: # <default: 'urllib3'; type: string; values: ['urllib3', 'requests']; is: optional>
       http_auth: # <default: None; type: None||string; is: optional>
       use_ssl: # <default: False; type: boolean; is: optional>
       index_name: # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       doc_id: # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       routing: # <default: None; type: None||string; is: optional>
       ttl: # <default: None; type: None||integer||string; is: optional>
       sniff_on_start: # <default: False; type: boolean; is: optional>
       sniff_on_connection_fail: # <default: False; type: boolean; is: optional>
       consistency: # <default: 'quorum'; type: string; values: ['one', 'quorum', 'all']; is: optional>
       store_interval_in_secs: # <default: 5; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the configuration, adjust the log level of the client libraries and select the connection class."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        for module_name in ['elasticsearch', 'urllib3', 'requests']:
            if self.getConfigurationValue('log_level') == 'info':
                logging.getLogger(module_name).setLevel(logging.WARN)
            else:
                # Set log level for elasticsarch library if configured to other than default.
                logging.getLogger(module_name).setLevel(self.logger.level)
        self.action = self.getConfigurationValue('action')
        self.format = self.getConfigurationValue('format')
        self.consistency = self.getConfigurationValue("consistency")
        self.ttl = self.getConfigurationValue("ttl")
        self.index_name = self.getConfigurationValue("index_name")
        self.routing_pattern = self.getConfigurationValue("routing")
        self.doc_id_pattern = self.getConfigurationValue("doc_id")
        self.es_nodes = self.getConfigurationValue("nodes")
        self.read_timeout = self.getConfigurationValue("read_timeout")
        if not isinstance(self.es_nodes, list):
            self.es_nodes = [self.es_nodes]
        if self.getConfigurationValue("connection_type") == 'urllib3':
            self.connection_class = elasticsearch.connection.Urllib3HttpConnection
        elif self.getConfigurationValue("connection_type") == 'requests':
            self.connection_class = elasticsearch.connection.RequestsHttpConnection

    def getStartMessage(self):
        """Return a one line summary of the index name pattern and the buffer size."""
        return "Idx: %s. Max buffer size: %d" % (
            self.index_name, self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Connect to elasticsearch and create the send buffer; shuts lumbermill down if no connection could be made."""
        BaseThreadedModule.initAfterFork(self)
        # Init es client after fork as mentioned in https://elasticsearch-py.readthedocs.org/en/master/
        self.es = self.connect()
        if not self.es:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork, init buffer here.
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create the elasticsearch client, retrying up to 5 times with a growing delay.

        Returns the client, or False (after requesting a lumbermill shutdown) if all attempts failed.
        """
        es = False
        tries = 0
        while tries < 5 and not es:
            try:
                # Connect to es node and round-robin between them.
                self.logger.debug("Connecting to %s." % self.es_nodes)
                es = elasticsearch.Elasticsearch(
                    self.es_nodes,
                    connection_class=self.connection_class,
                    timeout=self.read_timeout,
                    sniff_on_start=self.getConfigurationValue('sniff_on_start'),
                    sniff_on_connection_fail=self.getConfigurationValue('sniff_on_connection_fail'),
                    sniff_timeout=5,
                    maxsize=20,
                    use_ssl=self.getConfigurationValue('use_ssl'),
                    http_auth=self.getConfigurationValue('http_auth'))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning(
                    "Connection to %s failed. Exception: %s, Error: %s."
                    % (self.es_nodes, etype, evalue))
                self.logger.warning(
                    "Waiting %s seconds before retring to connect."
                    % ((4 + tries)))
                time.sleep(4 + tries)
                tries += 1
                continue
        if not es:
            self.logger.error("Connection to %s failed. Shutting down." % self.es_nodes)
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % self.es_nodes)
        return es

    def handleEvent(self, event):
        """Queue the formatted event (or the raw event if no format is set) for transmission."""
        if self.format:
            publish_data = self.getConfigurationValue('format', event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def dataToElasticSearchJson(self, events):
        """
        Format data for elasticsearch bulk update.

        Every event becomes an action header line followed by the document line. Events
        without a resolvable doc_id or that cannot be json encoded are logged and skipped.
        Returns the concatenated bulk body as a string.
        """
        json_data = []
        for event in events:
            index_name = mapDynamicValueInString(self.index_name, event, use_strftime=True).lower()
            event_type = event['lumbermill']['event_type'] if 'lumbermill' in event and 'event_type' in event['lumbermill'] else 'Unknown'
            doc_id = mapDynamicValue(self.doc_id_pattern, event)
            routing = mapDynamicValue(self.routing_pattern, use_strftime=True)
            if not doc_id:
                self.logger.error(
                    "Could not find doc_id %s for event %s."
                    % (self.getConfigurationValue("doc_id"), event))
                continue
            action_meta = {
                '_index': index_name,
                '_type': event_type,
                '_id': doc_id
            }
            # Metadata belongs to the configured action; the header only has a key named
            # after self.action, so a hard-coded 'index' key fails for 'update'.
            if self.routing_pattern:
                action_meta['_routing'] = routing
            if self.ttl:
                action_meta['_ttl'] = self.ttl
            header = {self.action: action_meta}
            if self.action == 'update':
                event = {'doc': event}
            try:
                json_data.append("\n".join(
                    (json.dumps(header), json.dumps(event), "\n")))
            except UnicodeDecodeError:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Could not json encode %s. Exception: %s, Error: %s."
                    % (event, etype, evalue))
        json_data = "".join(json_data)
        return json_data

    def storeData(self, events):
        """
        Send the events to elasticsearch via one bulk request.

        Returns True on success or when there was nothing sendable. On errors the problem is
        logged, a reconnect is attempted where it makes sense and None is returned.
        """
        json_data = self.dataToElasticSearchJson(events)
        if not json_data:
            # All events were skipped while building the payload, do not send an empty bulk request.
            return True
        try:
            self.es.bulk(body=json_data, consistency=self.consistency)
            return True
        except elasticsearch.exceptions.ConnectionError:
            try:
                # Wrap the node list in a tuple; the message takes exactly one argument.
                self.logger.warning(
                    "Lost connection to %s. Trying to reconnect."
                    % (self.es_nodes,))
                self.es = self.connect()
            except Exception:
                time.sleep(.5)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Server communication error. Exception: %s, Error: %s."
                % (etype, evalue))
            self.logger.debug("Payload: %s" % json_data)
            # Search the error text; the exception object itself does not support a substring test.
            error_message = str(evalue)
            if "Broken pipe" in error_message or "Connection reset by peer" in error_message:
                self.es = self.connect()

    def shutDown(self):
        """Flush pending events on a best-effort basis, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be called before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Could not flush buffer during shutdown. Exception: %s, Error: %s."
                % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class SQSSink(BaseThreadedModule):
    """
    Send messages to amazon sqs service.

    aws_access_key_id: Your AWS id.
    aws_secret_access_key: Your AWS password.
    region: The region in which to find your sqs service.
    queue: Queue name.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set event.data will be send es MessageBody, all other fields will be send as MessageAttributes.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Number of messages to collect before starting to send messages to sqs. This refers to the internal receive buffer of this plugin. When the receive buffer is maxed out, this plugin will always send the maximum of 10 messages in one send_message_batch call.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    values: ['us-east-1', 'us-west-1', 'us-west-2', 'eu-central-1', 'eu-west-1', 'ap-southeast-1', 'ap-southeast-2', 'ap-northeast-1', 'sa-east-1', 'us-gov-west-1', 'cn-north-1']

    Configuration template:

    - SQSSink:
       aws_access_key_id: # <type: string; is: required>
       aws_secret_access_key: # <type: string; is: required>
       region: # <type: string; is: required>
       queue: # <type: string; is: required>
       format: # <default: None; type: None||string; is: optional>
       store_interval_in_secs: # <default: 5; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 500; type: integer; is: optional>
       receivers:
        - NextModule
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the configuration and silence the boto loggers."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        # Set boto log level.
        logging.getLogger('boto3').setLevel(logging.CRITICAL)
        logging.getLogger('botocore').setLevel(logging.CRITICAL)
        self.batch_size = self.getConfigurationValue('batch_size')
        self.format = self.getConfigurationValue('format')

    def getStartMessage(self):
        """Return a one line summary of queue, region and buffer size."""
        return "Queue: %s [%s]. Max buffer size: %d" % (
            self.getConfigurationValue('queue'),
            self.getConfigurationValue('region'),
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the send buffer and look up the sqs queue; shuts lumbermill down if that fails."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))
        try:
            self.sqs_resource = boto3.resource(
                'sqs',
                region_name=self.getConfigurationValue('region'),
                api_version=None,
                use_ssl=True,
                verify=None,
                endpoint_url=None,
                aws_access_key_id=self.getConfigurationValue('aws_access_key_id'),
                aws_secret_access_key=self.getConfigurationValue('aws_secret_access_key'),
                aws_session_token=None,
                config=None)
            self.sqs_queue = self.sqs_resource.get_queue_by_name(
                QueueName=self.getConfigurationValue('queue'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not connect to sqs service. Exception: %s, Error: %s."
                % (etype, evalue))
            self.lumbermill.shutDown()

    def handleEvent(self, event):
        """Queue the event for transmission."""
        self.buffer.append(event)
        yield None

    def storeData(self, buffered_data):
        """
        Send the buffered events to the sqs queue in batches of at most 10 messages.

        The message body is the formatted event if a format is configured, otherwise the
        json encoded event. Events that cannot be json encoded are logged and skipped.
        """
        # The class documentation gives 10 messages as the maximum per batch call.
        max_batch_entries = 10
        batch_messages = []
        for event in buffered_data:
            try:
                message_id = event['lumbermill']['event_id']
            except KeyError:
                # No event id available, generate a random one.
                message_id = "%032x%s" % (random.getrandbits(128), os.getpid())
            message = {'Id': message_id}
            if self.format:
                event = mapDynamicValue(self.format, event)
            else:
                try:
                    event = json.dumps(event)
                except Exception:
                    etype, evalue, etb = sys.exc_info()
                    self.logger.warning(
                        "Error while encoding event data: %s to json. Exception: %s, Error: %s."
                        % (event, etype, evalue))
                    # Without an encoded body there is nothing valid to send for this event.
                    continue
            message['MessageBody'] = event
            batch_messages.append(message)
            # Send as soon as a batch is full, not after every single message.
            if len(batch_messages) == max_batch_entries:
                self.sqs_queue.send_messages(Entries=batch_messages)
                batch_messages = []
        if batch_messages:
            # Send the remaining, partially filled batch.
            self.sqs_queue.send_messages(Entries=batch_messages)

    def shutDown(self):
        """Flush pending events on a best-effort basis, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be called before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Could not flush buffer during shutdown. Exception: %s, Error: %s."
                % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class MongoDbSink(BaseThreadedModule):
    """
    Store incoming events in a mongodb.

    host: Mongodb server.
    database: Mongodb database name.
    collection: Mongodb collection name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    optinonal_connection_params: Other optional parameters as documented in https://api.mongodb.org/python/current/api/pymongo/mongo_client.html
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send.
    doc_id: Sets the document id for the committed event data.
    store_interval_in_secs: Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - MongoDbSink:
       host: # <default: 'localhost:27017'; type: string; is: optional>
       database: # <default: 'lumbermill'; type: string; is: optional>
       collection: # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       optinonal_connection_params: # <default: {'serverSelectionTimeoutMS': 5}; type: dictionary; is: optional>
       format: # <default: None; type: None||string; is: optional>
       doc_id: # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       store_interval_in_secs: # <default: 5; type: integer; is: optional>
       batch_size: # <default: 500; type: integer; is: optional>
       backlog_size: # <default: 5000; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the configuration values used for every stored event."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.collection = self.getConfigurationValue('collection')
        self.database = self.getConfigurationValue('database')
        self.doc_id_pattern = self.getConfigurationValue("doc_id")

    def getStartMessage(self):
        """Return a one line summary of the database name and the buffer size."""
        return "DB: %s. Max buffer size: %d" % (
            self.getConfigurationValue('database'),
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Connect to mongodb and create the send buffer; shuts lumbermill down if no connection could be made."""
        BaseThreadedModule.initAfterFork(self)
        # Init monogdb client after fork.
        self.mongodb = self.connect()
        if not self.mongodb:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork, init buffer here.
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create the mongodb client and verify the connection by requesting the server info.

        Returns the client, or None (after requesting a lumbermill shutdown) if the connection failed.
        """
        # Bind the name up front, so a failing MongoClient() call can not leave it undefined.
        mongodb_client = None
        try:
            mongodb_client = pymongo.MongoClient(
                self.getConfigurationValue('host'),
                **self.getConfigurationValue('optinonal_connection_params'))
            self.logger.debug(str(mongodb_client.server_info()))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Connection to %s failed. Exception: %s, Error: %s."
                % (self.getConfigurationValue('host'), etype, evalue))
            # A client whose server info can not be fetched counts as not connected.
            mongodb_client = None
        if mongodb_client is None:
            self.logger.error("Connection to %s failed. Shutting down." % self.getConfigurationValue('host'))
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % self.getConfigurationValue('host'))
        return mongodb_client

    def handleEvent(self, event):
        """Queue the formatted event (or the raw event if no format is set) for transmission."""
        if self.format:
            publish_data = self.getConfigurationValue('format', event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def storeData(self, events):
        """
        Insert the events into mongodb, using one ordered bulk operation per target collection.

        Events without a resolvable doc_id are logged and skipped. Errors are logged; a
        reconnect is attempted if the error text hints at a lost connection.
        """
        mongo_db = self.mongodb[self.database]
        bulk_objects = {}
        for event in events:
            collection_name = mapDynamicValueInString(
                self.collection, event, use_strftime=True).lower()
            doc_id = mapDynamicValue(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error("Could not find doc_id %s for event %s." % (self.doc_id_pattern, event))
                continue
            event['_id'] = doc_id
            if collection_name not in bulk_objects:
                bulk_objects[collection_name] = mongo_db[collection_name].initialize_ordered_bulk_op()
            try:
                bulk_objects[collection_name].insert(event)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Server communication error. Exception: %s, Error: %s."
                    % (etype, evalue))
                self.logger.debug("Payload: %s" % event)
                # Search the error text; the exception object itself does not support a substring test.
                error_message = str(evalue)
                if "Broken pipe" in error_message or "Connection reset by peer" in error_message:
                    self.mongodb = self.connect()
        # items() works on python 2 and 3, iteritems() only on python 2.
        for collection_name, bulk_object in bulk_objects.items():
            try:
                result = bulk_object.execute()
                self.logger.debug(str(result))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Server communication error. Exception: %s, Error: %s."
                    % (etype, evalue))

    def shutDown(self):
        """Flush pending events on a best-effort basis, then run the parent shutdown."""
        try:
            self.buffer.flush()
        except AttributeError:
            # shutDown can be called before initAfterFork created the buffer.
            pass
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Could not flush buffer during shutdown. Exception: %s, Error: %s."
                % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class MongoDbSink(BaseThreadedModule):
    """
    Store incoming events in a mongodb.

    host: Mongodb server.
    database: Mongodb database name.
    collection: Mongodb collection name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    optinonal_connection_params: Other optional parameters as documented in https://api.mongodb.org/python/current/api/pymongo/mongo_client.html
                                 (The key really is spelled this way; it is read verbatim from the configuration.)
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'.
            If not set the whole event dict is sent.
    doc_id: Sets the document id for the committed event data.
    store_interval_in_secs: Send data to mongodb in x seconds intervals.
    batch_size: Send data to mongodb if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - MongoDbSink:
       host:                            # <default: 'localhost:27017'; type: string; is: optional>
       database:                        # <default: 'lumbermill'; type: string; is: optional>
       collection:                      # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       optinonal_connection_params:     # <default: {'serverSelectionTimeoutMS': 5}; type: dictionary; is: optional>
       format:                          # <default: None; type: None||string; is: optional>
       doc_id:                          # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 5000; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the static configuration values used on every flush."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.collection = self.getConfigurationValue('collection')
        self.database = self.getConfigurationValue('database')
        self.doc_id_pattern = self.getConfigurationValue("doc_id")

    def getStartMessage(self):
        """Return the one-line status string describing this sink."""
        return "DB: %s. Max buffer size: %d" % (self.getConfigurationValue('database'),
                                                self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the mongodb client and the event buffer once the process has forked."""
        BaseThreadedModule.initAfterFork(self)
        # Init mongodb client after fork.
        self.mongodb = self.connect()
        if self.mongodb is None:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork,
        # init buffer here.
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create a pymongo client for the configured host.

        Returns the client, or None if the client object could not be created at all
        (in which case a shutdown is requested). A client whose initial server_info()
        probe failed is still returned, as before; only a warning is logged for it.
        """
        host = self.getConfigurationValue('host')
        # Pre-bind the name so a failing MongoClient() call cannot cause an UnboundLocalError below.
        mongodb_client = None
        probe_succeeded = False
        try:
            mongodb_client = pymongo.MongoClient(host, **self.getConfigurationValue('optinonal_connection_params'))
            self.logger.debug(str(mongodb_client.server_info()))
            probe_succeeded = True
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning("Connection to %s failed. Exception: %s, Error: %s." % (host, etype, evalue))
        if mongodb_client is None:
            self.logger.error("Connection to %s failed. Shutting down." % host)
            self.lumbermill.shutDown()
        elif probe_succeeded:
            # Only claim success when the server actually answered the probe.
            self.logger.debug("Connection to %s successful." % host)
        return mongodb_client

    def handleEvent(self, event):
        """Queue the event (or its formatted representation) for the next bulk write."""
        if self.format:
            publish_data = self.getConfigurationValue('format', event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def storeData(self, events):
        """
        Write the buffered events to mongodb, one ordered bulk operation per collection.

        Events without a resolvable doc_id are logged and skipped.
        Returns True if every bulk operation executed without raising, False otherwise.
        NOTE(review): the boolean mirrors what the other sinks' storeData callbacks return;
        confirm how Buffer treats the callback result.
        """
        mongo_db = self.mongodb[self.database]
        bulk_objects = {}
        for event in events:
            collection_name = mapDynamicValueInString(self.collection, event, use_strftime=True).lower()
            doc_id = mapDynamicValue(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error("Could not find doc_id %s for event %s." % (self.doc_id_pattern, event))
                continue
            event['_id'] = doc_id
            if collection_name not in bulk_objects:
                bulk_objects[collection_name] = mongo_db[collection_name].initialize_ordered_bulk_op()
            try:
                bulk_objects[collection_name].insert(event)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
                self.logger.debug("Payload: %s" % event)
                # evalue is an exception instance; it has to be stringified before substring checks.
                error_text = str(evalue)
                if "Broken pipe" in error_text or "Connection reset by peer" in error_text:
                    self.mongodb = self.connect()
        all_executed = True
        for collection_name, bulk_object in bulk_objects.items():
            try:
                result = bulk_object.execute()
                self.logger.debug(str(result))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
                all_executed = False
        return all_executed

    def shutDown(self):
        """Flush pending events (best effort) and shut the module down."""
        try:
            self.buffer.flush()
        except Exception:
            # Best effort: self.buffer does not exist if initAfterFork bailed out before creating it.
            pass
        BaseThreadedModule.shutDown(self)
class FileSink(BaseThreadedModule):
    """
    Store all received events in a file.

    file_name: Absolute path to file. String may contain pythons strftime directives and event fields, e.g. %Y-%m-%d.
    format: Which event fields to use in the logline, e.g. '$(@timestamp) - $(url) - $(country_code)'
    store_interval_in_secs: Write data to file in x seconds intervals.
    batch_size: Write data to file if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.
    compress: Compress output as gzip or snappy file. For this to be effective, the chunk size should not be too small.

    Configuration template:

    - FileSink:
       file_name:                       # <type: string; is: required>
       format:                          # <default: '$(data)'; type: string; is: optional>
       store_interval_in_secs:          # <default: 10; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
       compress:                        # <default: None; type: None||string; values: [None,'gzip','snappy']; is: optional>
    """

    module_type = "output"
    """Set module type"""
    can_run_forked = False

    def configure(self, configuration):
        """Read configuration, load the optional compression module and start buffer and housekeeping timer."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.batch_size = self.getConfigurationValue('batch_size')
        self.backlog_size = self.getConfigurationValue('backlog_size')
        self.file_name = self.getConfigurationValue('file_name')
        self.format = self.getConfigurationValue('format')
        self.compress = self.getConfigurationValue('compress')
        # Maps path -> {'handle': open file object, 'lru': timestamp of last use}.
        self.file_handles = {}
        if self.compress == 'gzip':
            try:
                # Import module into namespace of object. Otherwise it will not be accessible when process was forked.
                self.gzip_module = __import__('gzip')
            except ImportError:
                self.logger.error('Gzip compression selected but gzip module could not be loaded.')
                self.lumbermill.shutDown()
        if self.compress == 'snappy':
            try:
                self.snappy_module = __import__('snappy')
            except ImportError:
                self.logger.error('Snappy compression selected but snappy module could not be loaded.')
                self.lumbermill.shutDown()
        self.buffer = Buffer(self.batch_size,
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.backlog_size)
        TimedFunctionManager.startTimedFunction(self.closeStaleFileHandles)

    def getStartMessage(self):
        """Return the one-line status string describing this sink."""
        return "File: %s. Max buffer size: %d" % (self.file_name, self.getConfigurationValue('backlog_size'))

    @setInterval(60)
    def closeStaleFileHandles(self):
        """
        Close and delete file handles that are unused since 5 minutes.
        """
        # Iterate over a snapshot: entries are removed from the dict inside the loop.
        for path, file_handle_data in list(self.file_handles.items()):
            last_used_time_ago = time.time() - file_handle_data['lru']
            if last_used_time_ago < 300:
                continue
            self.logger.info('Closing stale file handle for %s.' % (path))
            file_handle_data['handle'].close()
            self.file_handles.pop(path, None)

    def closeAllFileHandles(self):
        """Close every cached file handle and forget it."""
        # Iterate over a snapshot: entries are removed from the dict inside the loop.
        for path, file_handle_data in list(self.file_handles.items()):
            self.logger.info('Closing file handle for %s.' % path)
            file_handle_data['handle'].close()
            self.file_handles.pop(path, None)

    def ensurePathExists(self, path):
        """Create the directory part of path if it is missing."""
        dirpath = os.path.dirname(path)
        # A bare file name has an empty dirname; os.makedirs('') would raise.
        if dirpath and not os.path.exists(dirpath):
            os.makedirs(dirpath)

    def handleEvent(self, event):
        """Queue the event for the next write."""
        self.buffer.append(event)
        yield None

    def getOrCreateFileHandle(self, path, mode):
        """
        Return the cached file handle for path, opening it with mode if necessary.

        Returns None (after logging an error) when the file cannot be opened.
        """
        file_handle = None
        try:
            file_handle = self.file_handles[path]['handle']
            self.file_handles[path]['lru'] = time.time()
        except KeyError:
            try:
                file_handle = open(path, mode)
                self.file_handles[path] = {'handle': file_handle, 'lru': time.time()}
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error('Could no open %s for writing. Exception: %s, Error: %s.' % (path, etype, evalue))
        return file_handle

    def storeData(self, events):
        """
        Group the events by their resolved target path and append them to the files.

        Every target path is attempted; a failing path is logged and skipped so it
        cannot prevent the remaining files from being written.
        Returns True if at least one file was written, False otherwise.
        """
        write_data = collections.defaultdict(str)
        for event in events:
            path = mapDynamicValue(self.file_name, mapping_dict=event, use_strftime=True)
            line = mapDynamicValue(self.format, mapping_dict=event)
            write_data["%s" % path] += line + "\n"
        wrote_something = False
        for path, lines in write_data.items():
            try:
                self.ensurePathExists(path)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error('Could no create path %s. Events could not be written. Exception: %s, Error: %s.' % (path, etype, evalue))
                continue
            mode = "a+"
            if self.compress == 'gzip':
                path += ".gz"
                mode += "b"
                lines = self.compressGzip(lines)
            elif self.compress == 'snappy':
                path += ".snappy"
                lines = self.compressSnappy(lines)
                mode += "b"
            try:
                fh = self.getOrCreateFileHandle(path, mode)
                fh.write(lines)
                fh.flush()
                wrote_something = True
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error('Could no write event data to %s. Exception: %s, Error: %s.' % (path, etype, evalue))
        # Report only after all paths were handled; returning inside the loop would drop every path but the first.
        return wrote_something

    def shutDown(self):
        """Flush pending events, close all files and shut the module down."""
        self.buffer.flush()
        self.closeAllFileHandles()
        BaseThreadedModule.shutDown(self)

    def compressGzip(self, data):
        """Return data as a complete gzip stream."""
        buffer = StringIO()
        compressor = self.gzip_module.GzipFile(mode='wb', fileobj=buffer)
        try:
            compressor.write(data)
        finally:
            # Closing the GzipFile writes the gzip trailer into the buffer.
            compressor.close()
        return buffer.getvalue()

    def compressSnappy(self, data):
        """Return data compressed with the snappy module loaded in configure."""
        return self.snappy_module.compress(data)
class ElasticSearchSink(BaseThreadedModule):
    """
    Store the data dictionary in an elasticsearch index.

    The elasticsearch module takes care of discovering all nodes of the elasticsearch cluster.
    Requests will then be load balanced via round robin.

    action: Either index or update. If update be sure to provide the correct doc_id.
    fields: Which event fields to send on, e.g. [timestamp, url, country_code]. If not set the whole event dict is sent.
    nodes: Configures the elasticsearch nodes.
    read_timeout: Set number of seconds to wait until requests to elasticsearch will time out.
    connection_type: One of: 'urllib3', 'requests'.
    http_auth: 'user:password'.
    use_ssl: One of: True, False.
    index_name: Sets the index name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    doc_id: Sets the es document id for the committed event data.
    doc_type: Sets the es document type for the committed event data.
    routing: Sets a routing value (@see: http://www.elasticsearch.org/blog/customizing-your-document-routing/)
             Timepatterns like %Y.%m.%d are allowed here.
    ttl: When set, documents will be automatically deleted after ttl expired.
         Can either set time in milliseconds or elasticsearch date format, e.g.: 1d, 15m etc.
         This feature needs to be enabled for the index.
         @See: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-ttl-field.html
    sniff_on_start: The client can be configured to inspect the cluster state to get a list of nodes upon startup.
                    Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    sniff_on_connection_fail: The client can be configured to inspect the cluster state to get a list of nodes upon failure.
                              Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    store_interval_in_secs: Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - ElasticSearchSink:
       action:                          # <default: 'index'; type: string; is: optional; values: ['index', 'update']>
       fields:                          # <default: None; type: None||list; is: optional>
       nodes:                           # <type: string||list; is: required>
       read_timeout:                    # <default: 10; type: integer; is: optional>
       connection_type:                 # <default: 'urllib3'; type: string; values: ['urllib3', 'requests']; is: optional>
       http_auth:                       # <default: None; type: None||string; is: optional>
       use_ssl:                         # <default: False; type: boolean; is: optional>
       index_name:                      # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       doc_id:                          # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       doc_type:                        # <default: '$(lumbermill.event_type)'; type: string; is: optional>
       routing:                         # <default: None; type: None||string; is: optional>
       ttl:                             # <default: None; type: None||integer||string; is: optional>
       sniff_on_start:                  # <default: False; type: boolean; is: optional>
       sniff_on_connection_fail:        # <default: False; type: boolean; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read configuration, align third-party logger levels and pick the connection class."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        for module_name in ['elasticsearch', 'urllib3', 'requests']:
            if self.getConfigurationValue('log_level') == 'info':
                logging.getLogger(module_name).setLevel(logging.WARN)
            else:
                # Set log level for elasticsarch library if configured to other than default.
                logging.getLogger(module_name).setLevel(self.logger.level)
        self.action = self.getConfigurationValue('action')
        self.fields = self.getConfigurationValue('fields')
        self.ttl = self.getConfigurationValue("ttl")
        self.index_name = self.getConfigurationValue("index_name")
        self.routing_pattern = self.getConfigurationValue("routing")
        self.doc_id_pattern = self.getConfigurationValue("doc_id")
        self.doc_type_pattern = self.getConfigurationValue("doc_type")
        self.doc_type_is_dynamic = self.isDynamicConfigurationValue("doc_type")
        self.es_nodes = self.getConfigurationValue("nodes")
        self.read_timeout = self.getConfigurationValue("read_timeout")
        if not isinstance(self.es_nodes, list):
            self.es_nodes = [self.es_nodes]
        if self.getConfigurationValue("connection_type") == 'urllib3':
            self.connection_class = elasticsearch.connection.Urllib3HttpConnection
        elif self.getConfigurationValue("connection_type") == 'requests':
            self.connection_class = elasticsearch.connection.RequestsHttpConnection

    def getStartMessage(self):
        """Return the one-line status string describing this sink."""
        return "Idx: %s. Max buffer size: %d" % (self.index_name, self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the elasticsearch client and the event buffer once the process has forked."""
        BaseThreadedModule.initAfterFork(self)
        # Init es client after fork as mentioned in https://elasticsearch-py.readthedocs.org/en/master/
        self.es = self.connect()
        if not self.es:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork,
        # init buffer here.
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create the elasticsearch client, retrying up to five times with a growing pause.

        Returns the client, or False after the last failed attempt (a shutdown is requested then).
        """
        es = False
        tries = 0
        while tries < 5 and not es:
            try:
                # Connect to es node and round-robin between them.
                self.logger.debug("Connecting to %s." % self.es_nodes)
                es = elasticsearch.Elasticsearch(self.es_nodes,
                                                 connection_class=self.connection_class,
                                                 timeout=self.read_timeout,
                                                 sniff_on_start=self.getConfigurationValue('sniff_on_start'),
                                                 sniff_on_connection_fail=self.getConfigurationValue('sniff_on_connection_fail'),
                                                 sniff_timeout=5,
                                                 maxsize=20,
                                                 use_ssl=self.getConfigurationValue('use_ssl'),
                                                 http_auth=self.getConfigurationValue('http_auth'))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning("Connection to %s failed. Exception: %s, Error: %s." % (self.es_nodes, etype, evalue))
                self.logger.warning("Waiting %s seconds before retring to connect." % ((4 + tries)))
                time.sleep(4 + tries)
                tries += 1
                continue
        if not es:
            self.logger.error("Connection to %s failed. Shutting down." % self.es_nodes)
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % self.es_nodes)
        return es

    def handleEvent(self, event):
        """Queue the event, reduced to the configured fields if any, for the next bulk request."""
        if self.fields:
            publish_data = {}
            for field in self.fields:
                try:
                    # Copy the field itself; dict.update() with the field's value would
                    # fail for scalars and flatten nested dicts.
                    publish_data[field] = event[field]
                except KeyError:
                    continue
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def dataToElasticSearchJson(self, events):
        """
        Format data for elasticsearch bulk update.

        Returns one string holding an action line and a document line per event.
        Events without a resolvable doc_id are logged and left out.
        """
        json_data = []
        for event in events:
            index_name = mapDynamicValueInString(self.index_name, event, use_strftime=True).lower()
            doc_type = mapDynamicValueInString(self.doc_type_pattern, event)
            doc_id = mapDynamicValueInString(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error("Could not find doc_id %s for event %s." % (self.getConfigurationValue("doc_id"), event))
                continue
            # The metadata lives under the configured action key ('index' or 'update'),
            # so optional values must be attached there as well.
            action_meta = {'_index': index_name, '_type': doc_type, '_id': doc_id}
            if self.routing_pattern:
                action_meta['_routing'] = mapDynamicValue(self.routing_pattern, use_strftime=True)
            if self.ttl:
                action_meta['_ttl'] = self.ttl
            header = {self.action: action_meta}
            if self.action == 'update':
                event = {'doc': event}
            try:
                json_data.append("\n".join((json.dumps(header), json.dumps(event), "\n")))
            except UnicodeDecodeError:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not json encode %s. Exception: %s, Error: %s." % (event, etype, evalue))
        return "".join(json_data)

    def storeData(self, events):
        """
        Send the buffered events to elasticsearch as one bulk request.

        Returns True if the request was accepted (or there was nothing to send),
        False if it failed; a reconnect is attempted on connection errors.
        """
        json_data = self.dataToElasticSearchJson(events)
        if not json_data:
            # Every event was rejected while building the payload; there is nothing to send.
            return True
        try:
            self.es.bulk(body=json_data)
            return True
        except elasticsearch.exceptions.ConnectionError:
            try:
                # Exactly one placeholder, one argument: a mismatch here raises TypeError,
                # which the handler below would swallow before connect() is ever called.
                self.logger.warning("Lost connection to %s. Trying to reconnect." % (self.es_nodes,))
                self.es = self.connect()
            except Exception:
                time.sleep(.5)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
            self.logger.debug("Payload: %s" % json_data)
            # evalue is an exception instance; it has to be stringified before substring checks.
            error_text = str(evalue)
            if "Broken pipe" in error_text or "Connection reset by peer" in error_text:
                self.es = self.connect()
        return False

    def shutDown(self):
        """Flush pending events (best effort) and shut the module down."""
        try:
            self.buffer.flush()
        except Exception:
            # Best effort: self.buffer does not exist if initAfterFork bailed out before creating it.
            pass
        BaseThreadedModule.shutDown(self)