def test_multiple_pats():
    """Check grok_match on patterns that combine several grok fields.

    Covers named captures, unnamed captures (an empty dict is expected),
    a line that does not match (None is expected) and an nginx access-log
    line.
    """
    text = 'gary 25 "never quit"'
    pat = '%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    m = grok_match(text, pat)
    assert m['name'] == 'gary' and m['age'] == '25' and m['motto'] == '"never quit"', \
        'grok match failed:%s, %s' % (text, pat, )

    # variable names are not set
    text = 'gary 25 "never quit"'
    pat = '%{WORD} %{INT} %{QUOTEDSTRING}'
    m = grok_match(text, pat)
    assert m == {}, 'grok match failed:%s, %s' % (text, pat, )

    # "male" is not INT
    text = 'gary male "never quit"'
    pat = '%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    m = grok_match(text, pat)
    assert m is None, 'grok match failed:%s, %s' % (text, pat, )

    # nginx log
    text = (
        'edge.v.iask.com.edge.sinastorage.com 14.18.243.65 6.032s - [21/Jul/2014:16:00:02 +0800]'
        + ' "GET /edge.v.iask.com/125880034.hlv HTTP/1.0" 200 70528990 "-"'
        + ' "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        + ' Chrome/36.0.1985.125 Safari/537.36"'
    )
    # Raw string: "\[" and "\]" are regex escapes, not Python string escapes.
    pat = (
        r'%{HOST:host} %{IP:client_ip} %{NUMBER:delay}s - \[%{DATA:time_stamp}\]'
        + ' "%{WORD:verb} %{URIPATHPARAM:uri_path} HTTP/%{NUMBER:http_ver}" %{INT:http_status} %{INT:bytes} %{QS}'
        + ' %{QS:client}'
    )
    m = grok_match(text, pat)
    assert (
        m['host'] == 'edge.v.iask.com.edge.sinastorage.com'
        and m['client_ip'] == '14.18.243.65'
        and m['delay'] == '6.032'
        and m['time_stamp'] == '21/Jul/2014:16:00:02 +0800'
        and m['verb'] == 'GET'
        and m['uri_path'] == '/edge.v.iask.com/125880034.hlv'
        and m['http_ver'] == '1.0'
        and m['http_status'] == '200'
        and m['bytes'] == '70528990'
        and m['client'] == '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        + ' Chrome/36.0.1985.125 Safari/537.36"'
    ), 'grok match failed:%s, %s' % (text, pat, )
def on_message(client, userdata, msg):
    """Print the grok parse of a received message and count it.

    The payload of ``msg`` is matched against the module-level
    ``grok_pattern`` using the custom patterns found in ``pats_dir``.
    Once the module-level ``msgCount`` exceeds 10 the client is
    disconnected.

    ``userdata`` is accepted but not used.
    """
    global msgCount

    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print(pygrok.grok_match(msg.payload, grok_pattern, custom_patterns_dir=pats_dir))

    msgCount += 1
    if msgCount > 10:
        client.disconnect()
def test_multiple_pats():
    """Check grok_match on patterns that combine several grok fields.

    Covers named captures, unnamed captures (an empty dict is expected),
    a line that does not match (None is expected) and an nginx access-log
    line.
    """
    text = 'gary 25 "never quit"'
    pat = "%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    m = grok_match(text, pat)
    assert m["name"] == "gary" and m["age"] == "25" and m["motto"] == '"never quit"', "grok match failed:%s, %s" % (
        text,
        pat,
    )

    # variable names are not set
    text = 'gary 25 "never quit"'
    pat = "%{WORD} %{INT} %{QUOTEDSTRING}"
    m = grok_match(text, pat)
    assert m == {}, "grok match failed:%s, %s" % (text, pat)

    # "male" is not INT
    text = 'gary male "never quit"'
    pat = "%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    m = grok_match(text, pat)
    assert m is None, "grok match failed:%s, %s" % (text, pat)

    # nginx log
    text = (
        "edge.v.iask.com.edge.sinastorage.com 14.18.243.65 6.032s - [21/Jul/2014:16:00:02 +0800]"
        + ' "GET /edge.v.iask.com/125880034.hlv HTTP/1.0" 200 70528990 "-"'
        + ' "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        + ' Chrome/36.0.1985.125 Safari/537.36"'
    )
    # Raw string: "\[" and "\]" are regex escapes, not Python string escapes.
    pat = (
        r"%{HOSTNAME:host} %{IP:client_ip} %{NUMBER:delay}s - \[%{DATA:time_stamp}\]"
        + ' "%{WORD:verb} %{URIPATHPARAM:uri_path} HTTP/%{NUMBER:http_ver}" %{INT:http_status} %{INT:bytes} %{QS}'
        + " %{QS:client}"
    )
    m = grok_match(text, pat)
    assert (
        m["host"] == "edge.v.iask.com.edge.sinastorage.com"
        and m["client_ip"] == "14.18.243.65"
        and m["delay"] == "6.032"
        and m["time_stamp"] == "21/Jul/2014:16:00:02 +0800"
        and m["verb"] == "GET"
        and m["uri_path"] == "/edge.v.iask.com/125880034.hlv"
        and m["http_ver"] == "1.0"
        and m["http_status"] == "200"
        and m["bytes"] == "70528990"
        and m["client"]
        == '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        + ' Chrome/36.0.1985.125 Safari/537.36"'
    ), "grok match failed:%s, %s" % (text, pat)
def test_custom_pats():
    """grok_match must resolve a pattern supplied through ``custom_patterns``."""
    sample = 'Beijing-1104,gary 25 "never quit"'
    template = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'

    # "ID" is defined here, in the dict handed to grok_match.
    extra_patterns = {'ID': '%{WORD}-%{INT}'}
    parsed = grok_match(sample, template, custom_patterns=extra_patterns)

    wanted = (
        ('user_id', 'Beijing-1104'),
        ('name', 'gary'),
        ('age', '25'),
        ('motto', '"never quit"'),
    )
    # all() stops at the first mismatch, exactly like a chained "and".
    assert all(parsed[key] == value for key, value in wanted), \
        'grok match failed:%s, %s' % (sample, template, )
def parse():
    """Convert matching lines from stdin into a cppcheck-style XML report.

    Every stdin line is stripped and matched against a grok pattern with
    the fields ``fname``, ``lineno``, ``rawmsg``, ``label`` and
    ``severity``. Lines that do not match, or that do not yield exactly
    those five fields, are skipped. The report (header, one ``<error>``
    element per accepted line, footer) is written to stderr.
    """
    # TODO: do this properly, using the xml module.
    quote = xml.sax.saxutils.quoteattr
    emit = sys.stderr.write

    # Write header
    emit('''<?xml version="1.0" encoding="UTF-8"?>\n''')
    emit('''<results version="2">\n''')
    emit(''' <cppcheck version=""/>\n''')
    emit(''' <errors>\n''')

    # Raw string: "\[" and "\]" are regex escapes, not Python string escapes.
    pattern = r"%{DATA:fname}:%{INT:lineno}: %{GREEDYDATA:rawmsg} \[%{DATA:label}\] \[%{INT:severity}"
    for line in sys.stdin:
        m = pygrok.grok_match(line.strip(), pattern)
        if not m or len(m) != 5:
            continue

        # Protect Jenkins from bad XML, which makes it barf.  quoteattr()
        # escapes the value and supplies the surrounding quotes itself, so
        # the free-text fields (message, label, file name) carry no quotes
        # in the templates below.
        msg = quote(m['rawmsg'])
        label = quote(m['label'])
        fname = quote(m['fname'])
        severity = cpplint_score_to_cppcheck_severity(int(m['severity']))

        emit(''' <error id=%s severity="%s" msg=%s>\n''' % (label, severity, msg))
        # lineno comes from %{INT}, so it needs no escaping.
        emit(''' <location file=%s line="%s"/>\n''' % (fname, m['lineno']))
        emit(''' </error>\n''')

    # Write footer
    emit(''' </errors>\n''')
    emit('''</results>\n''')
def parse_log():
    """Parse the posted example log lines and return the results as JSON.

    The request body is JSON with a ``log_format`` entry, from which the
    grok pattern is built, and a ``log_examples`` entry holding the lines
    to parse, separated by newlines. The response is a JSON object whose
    ``parsed_logs`` list holds one grok_match result per example line.
    """
    import json
    from pygrok import grok_match

    raw_body = request.get_data()
    app.logger.warning("form:%s", raw_body)
    payload = json.loads(raw_body)
    log_format = payload['log_format']

    grok_pat = build_grok_pattern(log_format)
    app.logger.warning('grok_pat:%s', grok_pat)

    parsed_logs = []
    for example in payload['log_examples'].split("\n"):
        parsed = grok_match(example, grok_pat)
        parsed_logs.append(parsed)
        app.logger.warning('pl:%s', json.dumps(parsed, indent=4))

    # The logstash configuration is only logged; it is not part of the reply.
    ls_conf = build_logstash_conf(log_format)
    app.logger.warning('ls_conf:%s', ls_conf)

    return json.dumps({'parsed_logs': parsed_logs})
def test_custom_pats():
    """grok_match must resolve a pattern supplied through ``custom_patterns``."""
    sample = 'Beijing-1104,gary 25 "never quit"'
    template = "%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"

    # "ID" is defined here, in the dict handed to grok_match.
    parsed = grok_match(sample, template, custom_patterns={"ID": "%{WORD}-%{INT}"})

    wanted = [
        ("user_id", "Beijing-1104"),
        ("name", "gary"),
        ("age", "25"),
        ("motto", '"never quit"'),
    ]
    # all() stops at the first mismatch, exactly like a chained "and".
    fields_ok = all(parsed[key] == value for key, value in wanted)
    assert fields_ok, "grok match failed:%s, %s" % (sample, template)
def test_custom_pat_files():
    """grok_match must load extra patterns from a custom patterns directory."""
    import os.path

    # Resolve the fixture directory relative to this file so the test does
    # not depend on the directory it is launched from.
    pats_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_patterns')

    text = 'Beijing-1104,gary 25 "never quit"'
    # pattern "ID" is defined in ./test_patterns/pats
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    m = grok_match(text, pat, custom_patterns_dir=pats_dir)
    assert m['user_id'] == 'Beijing-1104' and m['name'] == 'gary' and m['age'] == '25' \
        and m['motto'] == '"never quit"', 'grok match failed:%s, %s' % (text, pat, )
def test_custom_pat_files():
    """grok_match must load extra patterns from a custom patterns directory."""
    import os.path

    # Resolve the fixture directory relative to this file so the test does
    # not depend on the directory it is launched from.
    pats_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_patterns')

    text = 'Beijing-1104,gary 25 "never quit"'
    # pattern "ID" is defined in ./test_patterns/pats
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    m = grok_match(text, pat, custom_patterns_dir=pats_dir)
    assert m['user_id'] == 'Beijing-1104' and m['name'] == 'gary' and m['age'] == '25' \
        and m['motto'] == '"never quit"', 'grok match failed:%s, %s' % (text, pat, )
def read(self, stream):
    """Yield a Point for each line of ``stream`` that matches ``self.pattern``.

    Each line is passed to ``pygrok.grok_match``; the resulting dict is
    expanded into keyword arguments for ``Point``. Lines that do not match
    are reported through ``logger`` and skipped.
    """
    # Iterate the stream directly so lines are consumed lazily instead of
    # being loaded all at once.
    for line in stream:
        matches = pygrok.grok_match(line, self.pattern)
        if matches is None:
            # logger.warn is a deprecated alias of logger.warning.
            logger.warning('not matchined %s', line)
            continue
        yield Point(**matches)
def test_one_pat():
    """Exercise grok_match with patterns made of a single grok field."""
    failure = 'grok match failed:%s, %s'

    # (input text, pattern, captured field) -- the captured value must equal
    # the input with surrounding whitespace removed.
    single_field_cases = (
        ('1024', '%{INT:test_int}', 'test_int'),
        ('1024', '%{NUMBER:test_num}', 'test_num'),
        ('garyelephant ', '%{WORD:name} ', 'name'),
        ('192.168.1.1', '%{IP:ip}', 'ip'),
        ('github.com', '%{HOSTNAME:website}', 'website'),
        ('1989-11-04 05:33:02+0800', '%{TIMESTAMP_ISO8601:ts}', 'ts'),
    )
    for text, pat, field in single_field_cases:
        m = grok_match(text, pat)
        assert m[field] == text.strip(), failure % (text, pat, )

    # you get nothing because variable name is not set,
    # compare "%{WORD}" and "%{WORD:variable_name}"
    text, pat = 'github', '%{WORD}'
    m = grok_match(text, pat)
    assert m == {}, failure % (text, pat, )

    # not match
    text, pat = 'github', '%{NUMBER:test_num}'
    m = grok_match(text, pat)
    assert m is None, failure % (text, pat, )

    # a ":int" suffix converts the captured value
    text, pat = '1989', '%{NUMBER:birthyear:int}'
    m = grok_match(text, pat)
    assert m == {'birthyear': 1989}, failure % (text, pat, )
def test_custom_pat_files():
    """grok_match must load extra patterns from a custom patterns directory."""
    import os.path

    # The fixture directory sits next to this test file.
    here = os.path.dirname(os.path.abspath(__file__))
    pats_dir = os.path.join(here, "test_patterns")

    sample = 'Beijing-1104,gary 25 "never quit"'
    # pattern "ID" is defined in ./test_patterns/pats
    template = "%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    parsed = grok_match(sample, template, custom_patterns_dir=pats_dir)

    wanted = [
        ("user_id", "Beijing-1104"),
        ("name", "gary"),
        ("age", "25"),
        ("motto", '"never quit"'),
    ]
    # all() stops at the first mismatch, exactly like a chained "and".
    fields_ok = all(parsed[key] == value for key, value in wanted)
    assert fields_ok, "grok match failed:%s, %s" % (sample, template)
def grok_parser(pattern, lines, cb_oups=None, date_key=None, date_format=None):
    """Yield the grok_match result of every line that matches ``pattern``.

    Args:
        pattern: grok pattern handed to ``grok_match`` for each line.
        lines: iterable of lines to parse.
        cb_oups: optional callable invoked with each line that does not
            match; such lines are never yielded.
        date_key: key of the matched field that holds a date string.
        date_format: ``strptime`` format used to parse that field.

    When both ``date_key`` and ``date_format`` are given, the parsed
    datetime is stored under the ``'timestamp'`` key of the yielded dict.
    """
    wants_timestamp = date_key is not None and date_format is not None

    for raw_line in lines:
        parsed = grok_match(raw_line, pattern)
        if parsed is not None:
            if wants_timestamp:
                parsed['timestamp'] = datetime.strptime(parsed[date_key], date_format)
            yield parsed
        elif cb_oups is not None:
            cb_oups(raw_line)
def __call__(self, env, start_response):
    """WSGI entry point: optionally grok-parse the body of a GET response.

    The wrapped application is always called first. Its response is passed
    through unchanged unless all of the following hold: the path splits
    into version/account/container/object, the request is a GET, it carries
    a ``grok`` parameter or a ``grok-pattern`` header, an object name is
    present and the response status is 200.

    For such requests every line of the response body is matched against
    the ``grok-pattern`` header and the body is replaced by one JSON
    document per line. A missing pattern or any failure while parsing
    yields the response built by ``self.get_err_response``.
    """
    req = Request(env)
    resp = req.get_response(self.app)

    try:
        (version, account, container, objname) = split_path(req.path_info, 1, 4, True)
    except ValueError:
        return resp(env, start_response)

    # "in" rather than dict.has_key(), which only exists on Python 2.
    is_grok_request = 'grok' in req.params or 'grok-pattern' in req.headers

    # grok request has to be explicit, and only expected for GET operations
    if req.method != 'GET' or not is_grok_request:
        return resp(env, start_response)

    self.logger.debug('Calling grok middleware')

    # make sure we have an object to work on
    if not objname or resp.status_int != 200:
        return resp(env, start_response)

    # the grok pattern is expected to be in the request headers;
    # a grok request without a pattern is answered with an error response
    pattern = req.headers.get('grok-pattern')
    if not pattern:
        self.logger.debug(
            'Object found, but no pattern requested, aborting')
        return self.get_err_response('Grok pattern is missing')(
            env, start_response)

    self.logger.debug('Starting grok operation')

    # we are going to assume the retrieved object is string object
    # and iterate through lines of resp.body and execute grok_match
    try:
        # Collect the pieces and join once instead of growing a string
        # with "+=" on every line.
        grokked_lines = [
            json.dumps(pygrok.grok_match(line, pattern)) + '\n'
            for line in StringIO.StringIO(resp.body)
        ]
    except Exception as e:
        return self.get_err_response(str(e))(env, start_response)

    resp.body = ''.join(grokked_lines)
    return resp(env, start_response)
def parse_file(path):
    """Rewrite an nginx log as tab-separated fields and record it as done.

    Every line of ``path`` is matched against ``pattern['nginx']``. For
    each entry of ``key_order`` the matching field is emitted, or ``'-'``
    when the entry is the empty string. The result is written to the file
    named like ``path`` with every ``'_old'`` removed. Afterwards a record
    with the log name and the current time is inserted into
    ``finished_nginx_log``.
    """
    # NOTE(review): if path contains no '_old', the output path equals the
    # input path and opening it for writing truncates the input -- confirm
    # callers always pass an '_old' name.
    out_path = path.replace('_old', '')

    # Context managers close both files even when a line fails to parse.
    with open(path) as source, open(out_path, mode='w') as target:
        for elem in source:
            nginx_dict = pygrok.grok_match(elem, pattern['nginx'])
            # NOTE(review): grok_match yields None for a line that does not
            # match, which raises TypeError below -- confirm whether such
            # lines should be skipped instead.
            fields = ['-' if item == '' else nginx_dict[item] for item in key_order]
            target.write('\t'.join(fields) + '\n')

    one = {'log_name': path, '__CREATE_TIME__': datetime.datetime.now(utc)}
    finished_nginx_log.insert_one(one)
def __call__(self, env, start_response):
    """WSGI entry point: optionally grok-parse the body of a GET response.

    The wrapped application is always called first. Its response is passed
    through unchanged unless all of the following hold: the path splits
    into version/account/container/object, the request is a GET, it carries
    a ``grok`` parameter or a ``grok-pattern`` header, an object name is
    present and the response status is 200.

    For such requests every line of the response body is matched against
    the ``grok-pattern`` header and the body is replaced by one JSON
    document per line. A missing pattern or any failure while parsing
    yields the response built by ``self.get_err_response``.
    """
    req = Request(env)
    resp = req.get_response(self.app)

    try:
        (version, account, container, objname) = split_path(req.path_info, 1, 4, True)
    except ValueError:
        return resp(env, start_response)

    # "in" rather than dict.has_key(), which only exists on Python 2.
    is_grok_request = "grok" in req.params or "grok-pattern" in req.headers

    # grok request has to be explicit, and only expected for GET operations
    if req.method != "GET" or not is_grok_request:
        return resp(env, start_response)

    self.logger.debug("Calling grok middleware")

    # make sure we have an object to work on
    if not objname or resp.status_int != 200:
        return resp(env, start_response)

    # the grok pattern is expected to be in the request headers;
    # a grok request without a pattern is answered with an error response
    pattern = req.headers.get("grok-pattern")
    if not pattern:
        self.logger.debug("Object found, but no pattern requested, aborting")
        return self.get_err_response("Grok pattern is missing")(env, start_response)

    self.logger.debug("Starting grok operation")

    # we are going to assume the retrieved object is string object
    # and iterate through lines of resp.body and execute grok_match
    try:
        # Collect the pieces and join once instead of growing a string
        # with "+=" on every line.
        grokked_lines = [
            json.dumps(pygrok.grok_match(line, pattern)) + "\n"
            for line in StringIO.StringIO(resp.body)
        ]
    except Exception as e:
        return self.get_err_response(str(e))(env, start_response)

    resp.body = "".join(grokked_lines)
    return resp(env, start_response)
def test_one_pat():
    """Exercise grok_match with patterns made of a single grok field."""
    failure = "grok match failed:%s, %s"

    # (input text, pattern, captured field) -- the captured value must equal
    # the input with surrounding whitespace removed.
    single_field_cases = [
        ("1024", "%{INT:test_int}", "test_int"),
        ("1024", "%{NUMBER:test_num}", "test_num"),
        ("garyelephant ", "%{WORD:name} ", "name"),
        ("192.168.1.1", "%{IP:ip}", "ip"),
        ("github.com", "%{HOSTNAME:website}", "website"),
        ("1989-11-04 05:33:02+0800", "%{TIMESTAMP_ISO8601:ts}", "ts"),
    ]
    for text, pat, field in single_field_cases:
        m = grok_match(text, pat)
        assert m[field] == text.strip(), failure % (text, pat)

    # you get nothing because variable name is not set,
    # compare "%{WORD}" and "%{WORD:variable_name}"
    text, pat = "github", "%{WORD}"
    m = grok_match(text, pat)
    assert m == {}, failure % (text, pat)

    # a pattern that does not match yields None
    text, pat = "github", "%{NUMBER:test_num}"
    m = grok_match(text, pat)
    assert m is None, failure % (text, pat)
def check_sample_pattern(self, logfile, pname, pattern):
    """Report how many lines of a sample log file match a grok pattern.

    Args:
        logfile: path of the sample file, read in full.
        pname: pattern name; when it is ``'@'`` every matching line and its
            parsed result are printed as well.
        pattern: grok pattern handed to ``pygrok.grok_match`` together with
            ``self.custpattern_dir``.

    Lines that do not match are always printed. The integer match
    percentage is logged at info level for 100, at warning level above 90
    and at error level otherwise. An empty file is logged as a warning
    instead of dividing by zero.
    """
    with open(logfile, 'r') as lf:
        lines = lf.read().splitlines()

    if not lines:
        logging.warning('{0} : no lines to check'.format(logfile))
        return

    match_count = 0
    for line_count, line in enumerate(lines, 1):
        m = pygrok.grok_match(line, pattern, custom_patterns_dir=self.custpattern_dir)
        # NOTE(review): the nesting below assumes matches are shown only for
        # pname '@' and misses are always shown -- confirm.
        if m is not None:
            match_count += 1
            if pname == '@':
                # Parenthesised prints are valid on Python 2 and Python 3.
                print(' {0}) {2}{1}{3}'.format(line_count, line, COLOR_WHITE, COLOR_RESET))
                print(' => {0}'.format(m))
        else:
            print(' {0}) {2}{1}{3}'.format(line_count, line, COLOR_RED, COLOR_RESET))

    # Floor division keeps the percentage an integer on every Python version.
    match_percent = match_count * 100 // len(lines)
    report = '{1} : match percent {0} %'.format(match_percent, logfile)
    if match_percent == 100:
        logging.info(report)
    elif match_percent > 90:
        logging.warning(report)
    else:
        logging.error(report)
def _geoip_city_info(ip):
    """Return country/region/city details for ``ip`` from the GeoIP2 database.

    Addresses missing from the database yield 'Not Found' for every entry.
    Raises IPDBNotFoundError when reading the database fails with IOError.
    """
    prj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    mmdb = os.path.join(prj_dir, 'dependencies', 'geoip2', 'world_city_geoip2.mmdb')
    try:
        reader = geoip2.database.Reader(mmdb, locales=['en'])
        try:
            response = reader.city(ip)
            return {
                'country': response.country.name,
                'region': response.subdivisions.most_specific.name,  # province
                'city': response.city.name,
                # NOTE(review): the postal code is stored under 'isp' --
                # confirm this is intended.
                'isp': response.postal.code,
            }
        except geoip2.errors.AddressNotFoundError:
            return {
                'country': 'Not Found',
                'region': 'Not Found',
                'city': 'Not Found',
                'isp': 'Not Found',
            }
        finally:
            # Close the reader on every path, including unexpected errors.
            reader.close()
    except IOError:
        raise IPDBNotFoundError()


def parse_single_line(log_format, log):
    """Parse one log line and convert its fields as described by log_format.

    The grok pattern built from ``log_format`` is anchored at both ends, so
    the whole line has to match. For every ``log_format`` entry whose
    ``type`` is ``'field'`` the lower-cased ``field_type`` selects a
    conversion:

    * ``integer/long`` / ``float/double``: the value is replaced by its
      int / float conversion.
    * ``ip`` with ``to_geoip`` set to True: geo details are stored under
      ``<name>#``.
    * ``json``: the decoded value is stored under ``<name>_json``.
    * ``custom_date``, ``httpdate``, ``iso8601``: left untouched.

    Returns the dict of parsed fields, or None if the line does not match
    or a numeric conversion fails.

    Raises IPDBNotFoundError when the GeoIP database cannot be read.
    """
    from grok.grok import build_grok_pattern

    grok_pat = build_grok_pattern(log_format)
    # Parenthesised print is valid on Python 2 and Python 3.
    print(grok_pat)

    # add beginning(^) and ending($) regex character
    parsed_log = grok_match(log, '^%s$' % grok_pat)
    if parsed_log is None:
        return None

    for field_spec in log_format.values():
        if field_spec['type'] != 'field':
            continue

        field_name = field_spec['name']
        field_type = field_spec['field_type'].lower()

        if field_type == 'integer/long':
            # convert numbers; int() is a no-op on values that already are
            try:
                parsed_log[field_name] = int(parsed_log[field_name])
            except ValueError:
                return None

        elif field_type == 'float/double':
            # convert numbers
            try:
                parsed_log[field_name] = float(parsed_log[field_name])
            except ValueError:
                return None

        # field_type was lower-cased above, so it must be compared to 'ip'.
        elif field_type == 'ip' and field_spec.get('to_geoip') is True:
            # ip to city and isp info
            parsed_log[field_name + '#'] = _geoip_city_info(parsed_log[field_name])

        elif field_type == 'json':
            # load json from string
            parsed_log[field_name + '_json'] = json.loads(parsed_log[field_name])

        elif field_type == 'custom_date':
            # parse date in custom format
            pass

        elif field_type == 'httpdate':
            # no check needed here; the value was already validated earlier
            pass

        elif field_type == 'iso8601':
            pass

    return parsed_log