def __init__(self, configuration):
    """Build a collector from *configuration*.

    Sets up the required input (command/file/http/class/tailer), an
    optional parser, the output, and bookkeeping counters.

    :param configuration: dict with mandatory 'input' and 'output'
        sections and optional 'parser' and 'metadata' sections.
    :raises AttributeError: if input type is 'tailer' but no 'tailer'
        section exists in the configuration.
    """
    self.config = configuration
    self.input = None
    self.parser = None
    self.metadata = None
    # Input is required
    arguments = {}
    if 'arguments' in self.config['input']:
        arguments = self.config['input']['arguments']
    if self.config['input']['type'] == 'command':
        self.input = CommandRunner(**arguments)
    elif self.config['input']['type'] == 'file':
        self.input = FileReader(**arguments)
    elif self.config['input']['type'] == 'http':
        self.input = HTTPReader(**arguments)
    elif self.config['input']['type'] == 'class':
        self.input = init_object(self.config['input']['name'], **arguments)
    elif self.config['input']['type'] == 'tailer':
        # BUG FIX: original read the undefined global name `config`
        # here, raising NameError at runtime; it must be self.config.
        if 'tailer' in self.config:
            self.input = Tailer(self.config['tailer'])
        else:
            raise AttributeError(
                "Missing tailer in config file for tailer type input")
    assert self.input

    # parser is optional for parsing data collected by input
    if 'parser' in self.config:
        arguments = {}
        if 'arguments' in self.config['parser']:
            arguments = self.config['parser']['arguments']
        if self.config['parser']['type'] == 'match':
            self.parser = MatchParser(
                self.config['parser']['pattern'].strip(),
                self.config['parser']['transform'].strip())
        elif self.config['parser']['type'] == 'split':
            self.parser = SplitParser(
                self.config['parser']['delimiter'].strip(),
                self.config['parser']['transform'].strip())
        elif self.config['parser']['type'] == 'dummy':
            self.parser = DummyParser()
        elif self.config['parser']['type'] == 'json':
            self.parser = JsonGrepParser(**arguments)
        elif self.config['parser']['type'] == 'class':
            self.parser = init_object(self.config['parser']['name'], **arguments)

    # -1 means "no limit" on consecutive collection failures.
    self._max_error_count = self.config['input'].get('max_error_count', -1)
    self._current_data = None
    self._number_collected = 0
    self._number_failed = 0
    self._error_count = 0
    # BUG FIX: original called create_output(config['output']) with the
    # undefined global `config`; it must be self.config.
    self._output = create_output(self.config['output'])
    if 'metadata' in self.config:
        self.metadata = self.config['metadata']
def __init__(self, collector_name, config, output, tailer=None):
    """Initialise the collector thread from its configuration.

    Sets up the input source (command, file, http, class or tailer), an
    optional parser, and the bookkeeping counters used by run()/info().

    :param collector_name: name of this collector (also the thread name).
    :param config: dict with an 'input' section and optional 'parser' section.
    :param output: destination object collected payloads are pushed to.
    :param tailer: shared tailer instance; required when input type is
        'tailer'. Now defaults to None (backward-compatible) so the
        missing-tailer case can be reported explicitly.
    :raises AttributeError: if input type is 'tailer' but *tailer* is None.
    """
    threading.Thread.__init__(self, name=collector_name)
    self.__collector_name = collector_name
    self.__config = config
    # Poll every `frequency` seconds unless a cron-style `schedule` is given.
    self.__sleep_time = self.__config['input'].get('frequency', 10)
    self.__cron = self.__config['input'].get('schedule', None)
    self.__schedule = None
    if self.__cron is not None:
        self.__schedule = CronEvent(self.__cron)
        log.debug("job scheduled at %s" % self.__schedule.numerical_tab)
    self.__input = None
    self.__parser = None
    self.__output = output
    if self.__config['input']['type'] == 'command':
        self.__input = CommandRunner(self.__config['input']['source'])
    elif self.__config['input']['type'] == 'file':
        self.__input = FileReader(self.__config['input']['path'])
    elif self.__config['input']['type'] == 'http':
        url = self.__config['input']['url']
        headers = self.__config['input'].get('headers', {})
        auth = self.__config['input'].get('auth', None)
        self.__input = HTTPReader(url, headers, auth)
    elif self.__config['input']['type'] == 'class':
        arguments = {}
        if 'arguments' in self.__config['input']:
            arguments = self.__config['input']['arguments']
        self.__input = init_object(self.__config['input']['name'], **arguments)
    elif self.__config['input']['type'] == 'tailer':
        # BUG FIX: a missing tailer was previously accepted silently, leaving
        # self.__input as None and failing much later inside run().
        if tailer is None:
            raise AttributeError("Missing tailer in config file for tailer type input")
        self.__input = tailer
    # Fail fast on an unknown/missing input type instead of crashing in run().
    assert self.__input
    if 'parser' in self.__config:
        if self.__config['parser']['type'] == 'match':
            self.__parser = MatchParser(self.__config['parser']['pattern'].strip(),
                                        self.__config['parser']['transform'].strip())
        elif self.__config['parser']['type'] == 'split':
            self.__parser = SplitParser(self.__config['parser']['delimiter'].strip(),
                                        self.__config['parser']['transform'].strip())
        elif self.__config['parser']['type'] == 'dummy':
            self.__parser = DummyParser()
        elif self.__config['parser']['type'] == 'json':
            arguments = {}
            if 'arguments' in self.__config['parser']:
                arguments = self.__config['parser']['arguments']
            self.__parser = JsonGrepParser(**arguments)
        elif self.__config['parser']['type'] == 'class':
            arguments = {}
            if 'arguments' in self.__config['parser']:
                arguments = self.__config['parser']['arguments']
            self.__parser = init_object(self.__config['parser']['name'], **arguments)
    self.__running = True
    self.__session_id = str(uuid.uuid4())
    # -1 means "never give up"; a positive value stops the thread after
    # that many consecutive collection failures.
    self.__max_error_count = self.__config['input'].get('max_error_count', -1)
    self.__current_data = None
    self.__number_collected = 0
    self.__number_failed = 0
    self.__sleep_count = 0
    self.__error_count = 0
    self.__last_check_minute = -1
def test_case3(self):
    """A top-level JSON list: matched values are collected in order."""
    grep = JsonGrepParser(pattern=". chargebackData", list_name="hcp-chargeback")
    raw = '[{"chargebackData": 1}, {"chargebackData": 2}]'
    result = grep.parse(raw)
    for key in ('timestamp', 'hostname', 'hcp-chargeback'):
        self.assertTrue(key in result)
    items = result["hcp-chargeback"]
    self.assertTrue(isinstance(items, list))
    self.assertTrue(len(items) == 2)
    self.assertTrue(items[0] == 1)
    self.assertTrue(items[1] == 2)
def test_case2(self):
    """Matched key holding a list of strings is exposed under list_name."""
    grep = JsonGrepParser(pattern="chargebackData", list_name="hcp-chargeback")
    raw = '{"chargebackData":["chargebackData1","chargebackData2"]}'
    result = grep.parse(raw)
    for key in ('timestamp', 'hostname', 'hcp-chargeback'):
        self.assertTrue(key in result)
    items = result["hcp-chargeback"]
    self.assertTrue(isinstance(items, list))
    self.assertTrue(len(items) == 2)
    self.assertTrue(items[0] == 'chargebackData1')
    self.assertTrue(items[1] == 'chargebackData2')
def test_case1(self):
    """Matched key holding a list of objects keeps each object intact."""
    grep = JsonGrepParser(pattern="chargebackData", list_name="hcp-chargeback")
    raw = ('{"chargebackData":[{"systemName":"hcp1.s3.ersa.edu.au"}, '
           '{"systemName":"hcp2.s3.ersa.edu.au"}]}')
    result = grep.parse(raw)
    for key in ('timestamp', 'hostname', 'hcp-chargeback'):
        self.assertTrue(key in result)
    items = result["hcp-chargeback"]
    self.assertTrue(isinstance(items, list))
    self.assertTrue(len(items) == 2)
    for entry, expected in zip(items, ('hcp1.s3.ersa.edu.au', 'hcp2.s3.ersa.edu.au')):
        self.assertTrue('systemName' in entry)
        self.assertTrue(entry['systemName'] == expected)
def __init__(self, collector_name, config, output, tailer=None):
    """Configure this collector thread.

    Wires up the input source described by config['input'], the optional
    parser described by config['parser'], and the counters reported by
    info().

    :param collector_name: collector (and thread) name.
    :param config: dict with an 'input' section and optional 'parser' section.
    :param output: object that packaged payloads are pushed to.
    :param tailer: shared tailer instance, mandatory for tailer-type input.
    :raises AttributeError: when input type is 'tailer' and *tailer* is None.
    """
    threading.Thread.__init__(self, name=collector_name)
    self.__collector_name = collector_name
    self.__config = config
    input_conf = self.__config['input']
    # Seconds between collections; a cron `schedule` takes precedence.
    self.__sleep_time = input_conf.get('frequency', 10)
    self.__cron = input_conf.get('schedule', None)
    self.__schedule = None
    if self.__cron is not None:
        self.__schedule = CronEvent(self.__cron)
        log.debug("job scheduled at %s" % self.__schedule.numerical_tab)
    self.__input = None
    self.__parser = None
    self.__output = output

    input_type = input_conf['type']
    if input_type == 'command':
        self.__input = CommandRunner(input_conf['source'])
    elif input_type == 'file':
        self.__input = FileReader(input_conf['path'])
    elif input_type == 'http':
        self.__input = HTTPReader(input_conf['url'],
                                  input_conf.get('headers', {}),
                                  input_conf.get('auth', None))
    elif input_type == 'class':
        self.__input = init_object(input_conf['name'],
                                   **input_conf.get('arguments', {}))
    elif input_type == 'tailer':
        if tailer is None:
            raise AttributeError("Missing tailer in config file for tailer type input")
        self.__input = tailer
    # An input must exist by now; anything else is a configuration error.
    assert self.__input

    if 'parser' in self.__config:
        parser_conf = self.__config['parser']
        parser_type = parser_conf['type']
        if parser_type == 'match':
            self.__parser = MatchParser(parser_conf['pattern'].strip(),
                                        parser_conf['transform'].strip())
        elif parser_type == 'split':
            self.__parser = SplitParser(parser_conf['delimiter'].strip(),
                                        parser_conf['transform'].strip())
        elif parser_type == 'dummy':
            self.__parser = DummyParser()
        elif parser_type == 'json':
            self.__parser = JsonGrepParser(**parser_conf.get('arguments', {}))
        elif parser_type == 'class':
            self.__parser = init_object(parser_conf['name'],
                                        **parser_conf.get('arguments', {}))

    self.__running = True
    self.__session_id = str(uuid.uuid4())
    # -1 disables the consecutive-failure limit.
    self.__max_error_count = input_conf.get('max_error_count', -1)
    self.__current_data = None
    self.__number_collected = 0
    self.__number_failed = 0
    self.__sleep_count = 0
    self.__error_count = 0
    self.__last_check_minute = -1
class Collector(threading.Thread):
    """Worker thread that periodically collects data from a configured
    input, optionally parses it, and pushes packaged payloads to an output.
    """

    def __init__(self, collector_name, config, output, tailer=None):
        """Set up input, optional parser and bookkeeping counters.

        :param collector_name: collector (and thread) name.
        :param config: dict with 'input' and optional 'parser'/'metadata' sections.
        :param output: object payloads are pushed to (push()/close()).
        :param tailer: shared tailer instance, required for tailer-type input.
        :raises AttributeError: tailer-type input without a tailer instance.
        """
        threading.Thread.__init__(self, name=collector_name)
        self.__collector_name = collector_name
        self.__config = config
        # Seconds between collections; a cron 'schedule' overrides frequency.
        self.__sleep_time = self.__config['input'].get('frequency', 10)
        self.__cron = self.__config['input'].get('schedule', None)
        self.__schedule = None
        if self.__cron is not None:
            self.__schedule = CronEvent(self.__cron)
            log.debug("job scheduled at %s" % self.__schedule.numerical_tab)
        self.__input = None
        self.__parser = None
        self.__output = output
        if self.__config['input']['type'] == 'command':
            self.__input = CommandRunner(self.__config['input']['source'])
        elif self.__config['input']['type'] == 'file':
            self.__input = FileReader(self.__config['input']['path'])
        elif self.__config['input']['type'] == 'http':
            #log.debug('input %s'%self.__config['input'])
            url = self.__config['input']['url']
            headers = self.__config['input'].get('headers', {})
            #log.debug('headers %s'%headers)
            auth = self.__config['input'].get('auth', None)
            self.__input = HTTPReader(url, headers, auth)
        elif self.__config['input']['type'] == 'class':
            arguments = {}
            if 'arguments' in self.__config['input']:
                arguments = self.__config['input']['arguments']
            self.__input = init_object(self.__config['input']['name'], **arguments)
        elif self.__config['input']['type'] == 'tailer':
            if tailer is None:
                raise AttributeError("Missing tailer in config file for tailer type input")
            self.__input = tailer
        # Unknown input type leaves self.__input as None: fail fast here.
        assert(self.__input)
        if 'parser' in self.__config:
            if self.__config['parser']['type'] == 'match':
                self.__parser = MatchParser(self.__config['parser']['pattern'].strip(),
                                            self.__config['parser']['transform'].strip())
            elif self.__config['parser']['type'] == 'split':
                self.__parser = SplitParser(self.__config['parser']['delimiter'].strip(),
                                            self.__config['parser']['transform'].strip())
            elif self.__config['parser']['type'] == 'dummy':
                self.__parser = DummyParser()
            elif self.__config['parser']['type'] == 'json':
                arguments = {}
                if 'arguments' in self.__config['parser']:
                    arguments = self.__config['parser']['arguments']
                self.__parser = JsonGrepParser(**arguments)
            elif self.__config['parser']['type'] == 'class':
                arguments = {}
                if 'arguments' in self.__config['parser']:
                    arguments = self.__config['parser']['arguments']
                self.__parser = init_object(self.__config['parser']['name'], **arguments)
        self.__running = True
        self.__session_id = str(uuid.uuid4())
        # -1 disables the consecutive-failure limit used in run().
        self.__max_error_count = self.__config['input'].get('max_error_count', -1)
        self.__current_data = None
        self.__number_collected = 0
        self.__number_failed = 0
        self.__sleep_count = 0
        self.__error_count = 0
        self.__last_check_minute = -1

    def quit(self):
        """Ask the run() loop to stop after its current iteration."""
        self.__running = False

    def info(self):
        """Return a snapshot dict of this collector's config and counters."""
        col_info = {"name": self.__collector_name, "config": self.__config,
                    "sleep_time": self.__sleep_time}
        col_info["session_id"] = self.__session_id
        col_info["is_running"] = self.__running
        col_info["current_data"] = self.__current_data
        col_info["number_collected"] = self.__number_collected
        col_info["number_failed"] = self.__number_failed
        col_info["sleep_count"] = self.__sleep_count
        col_info["error_count"] = self.__error_count
        col_info["max_error_count"] = self.__max_error_count
        if self.__cron is not None:
            col_info["cron"] = self.__cron
        if self.__config['input']['type'] == 'tailer':
            col_info["tailer"] = self.__input.info(self.__config['input']['path'])
        return col_info

    def match_time(self):
        """Return True if this event should trigger at the specified datetime"""
        if self.__schedule is None:
            return False
        t = datetime.datetime.now()
        # Check at most once per wall-clock minute.
        if t.minute == self.__last_check_minute:
            return False
        self.__last_check_minute = t.minute
        log.debug("check if cron job can be triggered. %d" % self.__last_check_minute)
        return self.__schedule.check_trigger((t.year, t.month, t.day, t.hour, t.minute))

    def run(self):
        """Main loop: collect on schedule, push payloads, track errors."""
        # count starts at sleep_time so the first iteration collects immediately.
        count = self.__sleep_time
        error_count = 0
        log.info("Collector %s has started.", self.__collector_name)
        while self.__running:
            args = {'config': self.__config['input']}
            if (self.__schedule is None and count == self.__sleep_time) or self.match_time():
                log.debug("Starting to collect data.")
                count = 0
                data = None
                no_msgs = 1
                try:
                    data = self.__input.get_data(**args)
                    if isinstance(data, collections.deque) or isinstance(data, list):
                        # NOTE(review): assumes items are bytes-like (have
                        # .decode) — looks Python-2 era; confirm under Py3.
                        self.__current_data = [l.decode('ASCII', 'ignore') for l in data]
                        payload = []
                        no_msgs = len(data)
                        for line in data:
                            log.debug("Raw data: %s", line)
                            payload.append(self.generate_payload(str(line.decode('ASCII', 'ignore'))))
                        if len(payload) > 0:
                            self.__output.push(payload)
                        else:
                            # Nothing collected: skip the success bookkeeping
                            # in the try/else branch below.
                            continue
                    else:
                        # a block of data: either string to be parsed or dict
                        self.__current_data = data
                        log.debug("Raw data: %s", data)
                        if isinstance(data, str):
                            payload = self.generate_payload(str(data.decode('ASCII', 'ignore')))
                        else:
                            payload = self.generate_payload(data)
                        self.__output.push(payload)
                except:
                    self.__current_data = data
                    log.exception('Unable to get or parse data. data: %s', data)
                    error_count += 1
                    # Stop the thread after max_error_count consecutive failures.
                    if self.__max_error_count > 0 and error_count >= self.__max_error_count:
                        self.__running = False
                        self.__error_count = error_count
                        break
                    self.__number_failed += no_msgs
                    if self.__config['input']['type'] == 'tailer':
                        self.__input.fail(**args)
                else:
                    # Successful collection resets the consecutive-error counter.
                    error_count = 0
                    self.__number_collected += no_msgs
                    if self.__config['input']['type'] == 'tailer':
                        self.__input.success(**args)
                    self.__error_count = error_count
            else:
                time.sleep(1)
                if self.__schedule is None:
                    count += 1
                self.__sleep_count = count
        self.__output.close()
        log.info("Collector %s has stopped.", self.__collector_name)

    def generate_payload(self, data):
        """Parse raw data and package the result in required format"""
        if self.__parser:
            data = self.__parser.parse(data)
            log.debug("Parser %s parsed data %s: ", self.__parser.__class__.__name__, data)
        payload = {"id": str(uuid.uuid4()), "session": self.__session_id}
        payload['data'] = data
        if 'metadata' in self.__config:
            # Copy configured metadata keys into the payload verbatim.
            for m in self.__config['metadata']:
                payload[m] = self.__config['metadata'][m]
        log.debug("payload to push: %s", payload)
        return payload
class Collector(object):
    """Collect from an input, process it and push by an output"""

    def __init__(self, configuration):
        """Build the collector: required input, optional parser, output,
        optional metadata.

        :param configuration: dict with 'input' and 'output' sections and
            optional 'parser'/'metadata' sections.
        :raises AttributeError: tailer-type input with no 'tailer' section.
        """
        self.config = configuration
        self.input = None
        self.parser = None
        self.metadata = None
        # Input is required
        arguments = {}
        if 'arguments' in self.config['input']:
            arguments = self.config['input']['arguments']
        if self.config['input']['type'] == 'command':
            self.input = CommandRunner(**arguments)
        elif self.config['input']['type'] == 'file':
            self.input = FileReader(**arguments)
        elif self.config['input']['type'] == 'http':
            self.input = HTTPReader(**arguments)
        elif self.config['input']['type'] == 'class':
            self.input = init_object(self.config['input']['name'], **arguments)
        elif self.config['input']['type'] == 'tailer':
            # BUG FIX: original referenced the undefined global `config`
            # (NameError); must be self.config.
            if 'tailer' in self.config:
                self.input = Tailer(self.config['tailer'])
            else:
                raise AttributeError(
                    "Missing tailer in config file for tailer type input")
        assert self.input

        # parser is optional for parsing data collected by input
        if 'parser' in self.config:
            arguments = {}
            if 'arguments' in self.config['parser']:
                arguments = self.config['parser']['arguments']
            if self.config['parser']['type'] == 'match':
                self.parser = MatchParser(
                    self.config['parser']['pattern'].strip(),
                    self.config['parser']['transform'].strip())
            elif self.config['parser']['type'] == 'split':
                self.parser = SplitParser(
                    self.config['parser']['delimiter'].strip(),
                    self.config['parser']['transform'].strip())
            elif self.config['parser']['type'] == 'dummy':
                self.parser = DummyParser()
            elif self.config['parser']['type'] == 'json':
                self.parser = JsonGrepParser(**arguments)
            elif self.config['parser']['type'] == 'class':
                self.parser = init_object(self.config['parser']['name'], **arguments)

        # -1 disables the failure limit checked in collect().
        self._max_error_count = self.config['input'].get('max_error_count', -1)
        self._current_data = None
        self._number_collected = 0
        self._number_failed = 0
        self._error_count = 0
        # BUG FIX: original used the undefined global `config` here too.
        self._output = create_output(self.config['output'])
        if 'metadata' in self.config:
            self.metadata = self.config['metadata']

    def collect(self):
        """Collect data and output to target"""
        error_count = 0
        args = {'config': self.config['input']}
        log.debug("Starting to collect data.")
        data = None
        no_msgs = 1
        try:
            data = self.input.get_data(**args)
            if isinstance(data, collections.deque) or isinstance(data, list):
                # NOTE(review): assumes each item is bytes-like (has .decode)
                # — Python-2 era assumption; confirm under Python 3.
                self._current_data = [
                    l.decode('ASCII', 'ignore') for l in data
                ]
                payload = []
                no_msgs = len(data)
                for line in data:
                    log.debug("Raw data: %s", line)
                    payload.append(
                        self.generate_package(
                            str(line.decode('ASCII', 'ignore'))))
                if len(payload) > 0:
                    self._output.push(payload)
            else:
                # a block of data: either string to be parsed or dict
                self._current_data = data
                log.debug("Raw data: %s", data)
                if isinstance(data, str):
                    payload = self.generate_package(
                        str(data.decode('ASCII', 'ignore')))
                else:
                    payload = self.generate_package(data)
                self._output.push(payload)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; failures are still logged and counted.
            self._current_data = data
            log.exception('Unable to get or parse data. data: %s', data)
            error_count += 1
            if self._max_error_count > 0 and error_count >= self._max_error_count:
                self._error_count = error_count
            self._number_failed += no_msgs
            if self.config['input']['type'] == 'tailer':
                self.input.fail(**args)
        else:
            # Success path: reset the consecutive-error counter.
            error_count = 0
            self._number_collected += no_msgs
            if self.config['input']['type'] == 'tailer':
                self.input.success(**args)
            self._error_count = error_count
        self._output.close()

    def generate_package(self, data):
        """Parse raw data and package the result in required format"""
        if self.parser:
            data = self.parser.parse(data)
            log.debug("Parser %s parsed data %s: ",
                      self.parser.__class__.__name__, data)
        log.debug("Data to be packaged: %s", data)
        return generate_payload(data, self.metadata)
class Collector(threading.Thread):
    """Worker thread that periodically collects data from a configured
    input, optionally parses it, and pushes packaged payloads to an output.
    """

    def __init__(self, collector_name, config, output, tailer=None):
        """Set up input, optional parser and bookkeeping counters.

        :param collector_name: collector (and thread) name.
        :param config: dict with 'input' and optional 'parser'/'metadata' sections.
        :param output: object payloads are pushed to (push()/close()).
        :param tailer: shared tailer instance, required for tailer-type input.
        :raises AttributeError: tailer-type input without a tailer instance.
        """
        threading.Thread.__init__(self, name=collector_name)
        self.__collector_name = collector_name
        self.__config = config
        # Seconds between collections; a cron 'schedule' overrides frequency.
        self.__sleep_time = self.__config['input'].get('frequency', 10)
        self.__cron = self.__config['input'].get('schedule', None)
        self.__schedule = None
        if self.__cron is not None:
            self.__schedule = CronEvent(self.__cron)
            log.debug("job scheduled at %s" % self.__schedule.numerical_tab)
        self.__input = None
        self.__parser = None
        self.__output = output
        if self.__config['input']['type'] == 'command':
            self.__input = CommandRunner(self.__config['input']['source'])
        elif self.__config['input']['type'] == 'file':
            self.__input = FileReader(self.__config['input']['path'])
        elif self.__config['input']['type'] == 'http':
            url = self.__config['input']['url']
            headers = self.__config['input'].get('headers', {})
            auth = self.__config['input'].get('auth', None)
            self.__input = HTTPReader(url, headers, auth)
        elif self.__config['input']['type'] == 'class':
            arguments = {}
            if 'arguments' in self.__config['input']:
                arguments = self.__config['input']['arguments']
            self.__input = init_object(self.__config['input']['name'], **arguments)
        elif self.__config['input']['type'] == 'tailer':
            if tailer is None:
                raise AttributeError("Missing tailer in config file for tailer type input")
            self.__input = tailer
        # BUG FIX: removed stray `print(self.__input)` debug statement.
        assert(self.__input)
        if 'parser' in self.__config:
            if self.__config['parser']['type'] == 'match':
                self.__parser = MatchParser(self.__config['parser']['pattern'].strip(),
                                            self.__config['parser']['transform'].strip())
            elif self.__config['parser']['type'] == 'split':
                self.__parser = SplitParser(self.__config['parser']['delimiter'].strip(),
                                            self.__config['parser']['transform'].strip())
            elif self.__config['parser']['type'] == 'dummy':
                self.__parser = DummyParser()
            elif self.__config['parser']['type'] == 'json':
                arguments = {}
                if 'arguments' in self.__config['parser']:
                    arguments = self.__config['parser']['arguments']
                self.__parser = JsonGrepParser(**arguments)
            elif self.__config['parser']['type'] == 'class':
                arguments = {}
                if 'arguments' in self.__config['parser']:
                    arguments = self.__config['parser']['arguments']
                self.__parser = init_object(self.__config['parser']['name'], **arguments)
        self.__running = True
        self.__session_id = str(uuid.uuid4())
        # -1 disables the consecutive-failure limit used in run().
        self.__max_error_count = self.__config['input'].get('max_error_count', -1)
        self.__current_data = None
        self.__number_collected = 0
        self.__number_failed = 0
        self.__sleep_count = 0
        self.__error_count = 0
        self.__last_check_minute = -1

    def quit(self):
        """Ask the run() loop to stop after its current iteration."""
        self.__running = False

    def info(self):
        """Return a snapshot dict of this collector's config and counters."""
        col_info = {"name": self.__collector_name, "config": self.__config,
                    "sleep_time": self.__sleep_time}
        col_info["session_id"] = self.__session_id
        col_info["is_running"] = self.__running
        col_info["current_data"] = self.__current_data
        col_info["number_collected"] = self.__number_collected
        col_info["number_failed"] = self.__number_failed
        col_info["sleep_count"] = self.__sleep_count
        col_info["error_count"] = self.__error_count
        col_info["max_error_count"] = self.__max_error_count
        if self.__cron is not None:
            col_info["cron"] = self.__cron
        if self.__config['input']['type'] == 'tailer':
            col_info["tailer"] = self.__input.info(self.__config['input']['path'])
        return col_info

    def match_time(self):
        """Return True if this event should trigger at the specified datetime"""
        if self.__schedule is None:
            return False
        t = datetime.datetime.now()
        # Check at most once per wall-clock minute.
        if t.minute == self.__last_check_minute:
            return False
        self.__last_check_minute = t.minute
        log.debug("check if cron job can be triggered. %d" % self.__last_check_minute)
        return self.__schedule.check_trigger((t.year, t.month, t.day, t.hour, t.minute))

    def run(self):
        """Main loop: collect on schedule, push payloads, track errors."""
        # count starts at sleep_time so the first iteration collects immediately.
        count = self.__sleep_time
        error_count = 0
        log.info("Collector %s has started." % self.__collector_name)
        while self.__running:
            args = {'config': self.__config['input']}
            if (self.__schedule is None and count == self.__sleep_time) or self.match_time():
                log.debug("Starting to collect data.")
                count = 0
                data = None
                no_msgs = 1
                try:
                    data = self.__input.get_data(**args)
                    if isinstance(data, collections.deque) or isinstance(data, list):
                        self.__current_data = [l.decode('ASCII', 'ignore') for l in data]
                        payload = []
                        no_msgs = len(data)
                        for line in data:
                            log.debug("raw data %s" % line)
                            payload.append(self.generate_payload(str(line.decode('ASCII', 'ignore'))))
                        if len(payload) > 0:
                            self.__output.push(payload)
                        else:
                            # Nothing collected: skip the success bookkeeping.
                            continue
                    else:
                        self.__current_data = data
                        log.debug("Raw data %s" % data)
                        payload = self.generate_payload(str(data.decode('ASCII', 'ignore')))
                        self.__output.push(payload)
                except:
                    # NOTE(review): deliberately broad — any collection error
                    # is logged and counted rather than killing the thread.
                    self.__current_data = data
                    log.exception('Unable to get or parse data. data: %s' % data)
                    error_count += 1
                    if self.__max_error_count > 0 and error_count >= self.__max_error_count:
                        self.__running = False
                        # BUG FIX: was `self.__error_count == error_count`, a
                        # no-op comparison; the count was never recorded.
                        self.__error_count = error_count
                        break
                    self.__number_failed += no_msgs
                    if self.__config['input']['type'] == 'tailer':
                        self.__input.fail(**args)
                else:
                    error_count = 0
                    self.__number_collected += no_msgs
                    if self.__config['input']['type'] == 'tailer':
                        self.__input.success(**args)
                    # BUG FIX: was `==` (comparison) instead of assignment.
                    self.__error_count = error_count
            else:
                time.sleep(1)
                if self.__schedule is None:
                    count += 1
                self.__sleep_count = count
        self.__output.close()
        log.info("Collector %s has stopped." % self.__collector_name)

    def generate_payload(self, data):
        """Parse raw data and package the result in required format"""
        if self.__parser:
            data = self.__parser.parse(data)
            log.debug("parsed data %s", data)
        payload = {"id": str(uuid.uuid4()), "session": self.__session_id}
        payload['data'] = data
        if 'metadata' in self.__config:
            # Copy configured metadata keys into the payload verbatim.
            for m in self.__config['metadata']:
                payload[m] = self.__config['metadata'][m]
        log.debug("payload to push: %s", payload)
        return payload