def process(self, data_to_process: str) -> Dict[str, Any]:
    """Transform a string in IIS logging style.

    Sample input:
        2019-09-11 02:02:45 WEBSITE-LIDL-ACCOUNT-PROD-WE GET / X-ARR-LOG-ID=b7fea7b7-7913-43a9-87a6-59bcca58e4b7 443 - 51.105.161.182 - - - website-lidl-account-prod-we.lidl-account-ase-prod-we.p.azurewebsites.net 200 0 0 6180 1065 15

    Uses a grok pattern to parse the string.

    Args:
        data_to_process: str containing the IIS log line.
    """
    pattern = (
        "%{TIMESTAMP_ISO8601:log_timestamp} %{NOTSPACE:sitename} %{WORD:cs_method} "
        "%{URIPATH:cs_uri_stem} %{NOTSPACE:cs_uri_query} %{NUMBER:s_port} "
        "%{NOTSPACE:cs_username} %{IPORHOST:c_ip} %{NOTSPACE:cs_useragent} "
        "%{NOTSPACE:cs_cookie} %{NOTSPACE:cs_referer} %{IPORHOST:cs_host} "
        "%{NUMBER:sc_status} %{NUMBER:sc_substatus} %{NUMBER:sc_win32_status} "
        "%{NUMBER:sc_bytes} %{NUMBER:cs_bytes} %{NUMBER:time_taken}"
    )
    logger.debug(f"Parsing: {data_to_process}")
    grok = Grok(pattern)
    parsed_data = grok.match(data_to_process)
    dict_for_db = {
        "time": parsed_data["log_timestamp"],
        "status_code": parsed_data["sc_status"],
        "outbound_data": parsed_data["sc_bytes"],
        "inbound_data": parsed_data["cs_bytes"],
        "time_taken": parsed_data["time_taken"],
        "raw_data": json.dumps(parsed_data),
    }
    return dict_for_db
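# Hedged usage sketch for process() above: the method body never touches
# `self`, so it can be exercised unbound with None. Assumes `json`, a
# module-level `logger`, and `pygrok.Grok` are already imported; the sample
# line is the one from the docstring.
sample = (
    "2019-09-11 02:02:45 WEBSITE-LIDL-ACCOUNT-PROD-WE GET / "
    "X-ARR-LOG-ID=b7fea7b7-7913-43a9-87a6-59bcca58e4b7 443 - 51.105.161.182 - - - "
    "website-lidl-account-prod-we.lidl-account-ase-prod-we.p.azurewebsites.net "
    "200 0 0 6180 1065 15"
)
row = process(None, sample)
print(row["status_code"], row["time_taken"])  # 200 15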
def test_match_unnamed():
    url = "https://*****:*****@test.com/path?query=1"
    grok = Grok("%{URI}", match_unnamed_groks=True)
    m = grok.match(url)
    assert m["USER"] == "foo"
    assert m["URI"] == url
    assert m["HOSTNAME"] == "test.com"
def extract_derived_fields(self, doc_fields):
    """
    Extract derived fields based on a document.

    :param doc_fields: document information used to extract derived fields
    :return: all derived fields
    """
    derived_fields = dict()
    for field_name, grok_pattern in self.settings.list_derived_fields:
        try:
            # If the key doesn't exist, an exception is raised
            doc_value = helpers.utils.get_dotkey_value(doc_fields, field_name, case_sensitive=False)

            if grok_pattern in self.grok_filters.keys():
                grok = self.grok_filters[grok_pattern]
            else:
                grok = Grok(grok_pattern)
                self.grok_filters[grok_pattern] = grok

            match_dict = grok.match(doc_value)
            if match_dict:
                for match_dict_k, match_dict_v in match_dict.items():
                    derived_fields[match_dict_k] = match_dict_v
        except KeyError:
            pass  # Ignore, value not found...

    return derived_fields
def make_parser(file: dict, groks_dir: str) -> Parser:
    """
    Builds a parser from a config file entry and a directory of extra grok expressions.

    :param file: A single `file` element from the array within the config file.
    :param groks_dir: A directory to find additional grok patterns
    :return: A parser for the specified configuration
    """
    if not all(key in file for key in ("type", "strptime", "path")):
        raise ValueError(
            f"File entry requires 'type', 'strptime' and 'path' keys. Found:\n{file}"
        )

    type: str = file["type"]
    strptime_pattern: str = file["strptime"]
    groks = [Grok(grok, custom_patterns_dir=groks_dir) for grok in file["path"]]

    if "grok" in file:
        grok_name = file["grok"]
    else:
        # Guess the grok from the type name...
        # Grok pattern names don't support hyphenation
        grok_name = "%%{%s}" % type.upper().replace("-", "")

    return Parser(
        type,
        Grok(grok_name, custom_patterns_dir=groks_dir),
        groks,
        strptime_pattern,
    )
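# Hedged usage sketch for make_parser: the config shape below is inferred from
# the code above, not taken from the project's docs, and "./patterns" plus the
# pattern names are invented. With no "grok" key, the grok name is guessed
# from "type" ("nginx-access" becomes "%{NGINXACCESS}").
entry = {
    "type": "nginx-access",
    "strptime": "%d/%b/%Y:%H:%M:%S %z",
    "path": ["%{YEAR:year}/%{MONTHNUM:month}"],  # groks applied to the path, per the code above
}
parser = make_parser(entry, groks_dir="./patterns")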
def date_finder(text):
    date = ""
    date_pattern = '%{YEAR:year}-%{MONTHNUM:month}-%{MONTHDAY:day}'
    matches = list(datefinder.find_dates(text))
    match_date = re.search(r'\d{4}-\d{2}-\d{2}', text)

    try:
        print("====using dateutil====")
        for i in text.splitlines():
            d = parser.parse(i)
            print(d.strftime("%Y-%m-%d"))
    except Exception as e:
        print(e)

    try:
        print("====using pygrok====")
        grok = Grok(date_pattern)
        print(grok.match(text))
    except Exception as e:
        print(e)

    try:
        print("====using datefinder====")
        if len(matches) > 0:
            date = matches[0]
            print(date)
        else:
            print('No dates found')
    except Exception as e:
        print(e)

    try:
        print("====using re + strptime====")
        date = datetime.datetime.strptime(match_date.group(), '%Y-%m-%d').date()
        print(date)
    except Exception as e:
        print(e)

    try:
        print("====using chunk grammars====")
        chunkGram = r"""NE:{<NNP>+<CD>}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(sentence.strip()) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(i) for i in tokenized_sentences]
        chunked_sentences = [chunkParser.parse(i) for i in tagged_sentences]
        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
        print(entity_names)
    except Exception as e:
        print(e)

    try:
        print("====using pydatum====")
        datum = Datum()
        print(datum.from_iso_date_string(text))
    except Exception as e:
        print(e)

    try:
        print("====using dateparser====")
        date = search_dates(text)
        print(date)
    except Exception as e:
        print(e)
def parse(log):
    grok = Grok(
        "<%{WORD:type}:%{WORD:severity}>%{TIMESTAMP_ISO8601:timestamp}%{SPACE}"
        "%{IP:ip}:%{IPORHOST:host}%{SPACE}%{GREEDYDATA:data}"
    )
    parsed = grok.match(log)
    # The payload is hex-encoded base64; decode both layers.
    parsed["data"] = b64decode(bytes.fromhex(parsed["data"])).decode()
    return parsed
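# Hedged round-trip example for parse(): the log line is invented to fit the
# pattern, and `b64decode` is assumed to come from the standard `base64` module.
from base64 import b64decode, b64encode

payload = b64encode(b"disk almost full").hex()  # hex-wrapped base64, as parse() expects
line = "<kernel:warn>2021-03-04T10:15:30 10.0.0.1:router.local " + payload
print(parse(line)["data"])  # disk almost full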
def calc_api_stat_daily(logfile):
    with open(logfile, 'r') as fin:
        lines = fin.readlines()

    pattern = (
        r'%{IPORHOST:clientip} - - \[%{HTTPDATE:timestamp}\] '
        r'"%{WORD:verb} %{URIPATHPARAM:request} HTTP/%{NUMBER:httpversion}" '
        r'%{NUMBER:response} (?:%{NUMBER:bytes}|-) '
        r'(?:"(?:%{URI:referrer}|-)"|%{QS:referrer}) %{QS:agent} %{QS:xforwardedfor} '
        r'%{BASE10NUM:request_duration}'
    )
    grok = Grok(pattern)

    api_stat_dic = {}
    for line in lines:
        res = grok.match(line)
        if res is None:
            continue
        api, duration = res['request'].split('?')[0], float(res['request_duration']) * 1000
        api = merge_api(api)
        val = api_stat_dic.get(api, None)
        if val is None:
            api_stat_dic[api] = {'cnt': 1, 'avg': duration, 'max': duration, 'min': duration}
        else:
            val['max'] = max(val['max'], duration)
            val['min'] = min(val['min'], duration)
            val['avg'] = (val['avg'] * val['cnt'] + duration) / (val['cnt'] + 1)
            val['cnt'] += 1

    # pprint.pprint(api_stat_dic)
    for k, v in api_stat_dic.items():
        print('{}\t{}\t{}\t{}\t{}'.format(k, v['cnt'], round(v['avg'], 0), v['max'], v['min']))
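# Self-contained sketch for calc_api_stat_daily. `merge_api` is an external
# helper not shown above, so we stub it as the identity; the access-log line
# is invented to fit the pattern.
merge_api = lambda api: api

with open("sample_access.log", "w") as f:
    f.write('1.2.3.4 - - [21/Jul/2014:16:00:02 +0800] "GET /api/users?id=1 HTTP/1.0" '
            '200 123 "-" "curl/7.0" "-" 0.032\n')

calc_api_stat_daily("sample_access.log")  # prints: /api/users  1  32.0  32.0  32.0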
def parse_base(syslog):
    pattern = (
        "<%{NUMBER:pri:int}>(?<logTime>(%{MONTH} +%{MONTHDAY} %{TIME}( %{YEAR})?"
        "|%{MONTH} +%{MONTHDAY} %{YEAR} %{TIME})) %{DATA:loghostname} "
        "%%%{NUMBER}%{DATA:module}/%{NUMBER:severity:int}/%{DATA:logTypeDesc}:"
        "(( -%{DATA:location};)?|(s)?) +%{GREEDYDATA:desc}"
    )
    grok = Grok(pattern)
    raw_log = syslog["message"]
    parsed_log = grok.match(raw_log)
    if parsed_log:
        # return parsed_log
        syslog.update(parsed_log)
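# A minimal sketch of the type-coercion suffix used above: `:int` in
# `%{NUMBER:pri:int}` makes pygrok cast the captured text, so the syslog
# priority comes back as an int rather than a string. The sample value is invented.
from pygrok import Grok

assert Grok("<%{NUMBER:pri:int}>").match("<190>") == {"pri": 190}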
def test_custom_pats():
    custom_pats = {'ID': '%{WORD}-%{INT}'}
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat, custom_patterns=custom_pats)
    m = grok.match(text)
    assert m['user_id'] == 'Beijing-1104' and m['name'] == 'gary' and m['age'] == '25' \
        and m['motto'] == '"never quit"', 'grok match failed:%s, %s' % (text, pat, )
def do_sync(self):
    """Read data out of text files and write it to stdout following Singer spec"""
    LOGGER.info("Extracting data")

    # read/persist the file in batches
    RECORDS_PER_BATCH = 100

    for dirpath, d in sorted(self.directories.items()):
        dirname = d['dirname']
        key_properties = ['_singer_gen_key'] if self.rec_hash_keys else []
        LOGGER.info('Writing schema for `{}`'.format(dirname))
        singer.write_schema(dirname, d['schema'], key_properties)

        if len(d['files']) > 0:
            LOGGER.info('Extracting data from `{}`'.format(dirname))
            lines_to_write = []
            for f in sorted(d['files'], key=lambda x: x['filename']):
                if self.file_format == 'jsonl':
                    for line in open(f['absolute_path'], 'r'):
                        parsed_line = json.loads(line)
                        parsed_line = self._add_key_to_rec(parsed_line, line)
                        lines_to_write.append(parsed_line)
                        if len(lines_to_write) >= RECORDS_PER_BATCH:
                            singer.write_records(dirname, lines_to_write)
                            lines_to_write = []
                elif self.file_format == 'csv':
                    for df in pd.read_csv(f['absolute_path'], parse_dates=False, chunksize=1):
                        rec = df.to_dict('records')[0]
                        rec = self._add_key_to_rec(rec)
                        lines_to_write.append(rec)
                        if len(lines_to_write) >= RECORDS_PER_BATCH:
                            singer.write_records(dirname, lines_to_write)
                            lines_to_write = []
                elif self.file_format == 'log':
                    # TODO Use pattern per table and get it not from config
                    grok = Grok(CONFIG['grok_pattern'])
                    for line in open(f['absolute_path'], 'r'):
                        parsed_line = grok.match(line)
                        if not parsed_line:
                            parsed_line = {}
                        parsed_line['_sdc_raw_log_line'] = line
                        lines_to_write.append(parsed_line)
                        if len(lines_to_write) >= RECORDS_PER_BATCH:
                            singer.write_records(dirname, lines_to_write)
                            lines_to_write = []

                # flush whatever is left after each file
                singer.write_records(d['dirname'], lines_to_write)
                lines_to_write = []

            singer.write_records(d['dirname'], lines_to_write)

        LOGGER.info('Writing state for `{}`'.format(dirname))
        singer.write_state(self.state)

    LOGGER.info('Writing final state')
    singer.write_state(self.state)
def test_predefined_patterns():
    grok = Grok("%{DATA}")
    errors = []
    for pattern in grok.predefined_patterns:
        try:
            Grok("%{" + pattern + "}")
        except Exception as e:
            errors.append((pattern, str(e)))
    assert errors == []
def build_schemas(self):
    """Do a pass over the files and use GenSon to generate their schemas"""
    # TODO add sampling so that we don't have to pass over every single record
    LOGGER.info('Building schemas')
    if not self.state.get('schemas'):
        self.state['schemas'] = {}

    for dirpath, d in self.directories.items():
        dirname = d['dirname']
        LOGGER.info('Building schema for `{}`'.format(dirname))
        schema_builder = SchemaBuilder()
        if not self.state['schemas'].get(dirname):
            self.state['schemas'][dirname] = {"type": "object", "properties": {}}
        else:
            LOGGER.info("Existing schema for `{}` will be used as seed schema".format(dirname))
        schema_builder.add_schema(self.state['schemas'][dirname])

        for f in d['files']:
            if self.file_format == 'jsonl':
                for line in open(f['absolute_path'], 'r'):
                    parsed_line = json.loads(line)
                    parsed_line = self._add_key_to_rec(parsed_line, line)
                    schema_builder.add_object(parsed_line)
            elif self.file_format == 'csv':
                # Note: parsing dates is pointless until date formatting support in GenSon
                for df in pd.read_csv(f['absolute_path'], parse_dates=False, chunksize=1):
                    rec = df.to_dict('records')[0]
                    rec = self._add_key_to_rec(rec)
                    schema_builder.add_object(rec)
            elif self.file_format == 'log':
                # TODO Use pattern per table and get it not from config
                grok = Grok(CONFIG['grok_pattern'])
                for line in open(f['absolute_path'], 'r'):
                    parsed_line = grok.match(line)
                    if not parsed_line:
                        parsed_line = {}
                    parsed_line['_sdc_raw_log_line'] = line
                    schema_builder.add_object(parsed_line)

        self.directories[dirpath]['schema'] = schema_builder.to_schema()
        self.state['schemas'][dirname] = self.directories[dirpath]['schema']

    LOGGER.info('Done building schemas')
def test_custom_pat_files():
    import os.path

    pats_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_patterns')
    text = 'Beijing-1104,gary 25 "never quit"'
    # pattern "ID" is defined in ./test_patterns/pats
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat, custom_patterns_dir=pats_dir)
    m = grok.match(text)
    assert m['user_id'] == 'Beijing-1104' and m['name'] == 'gary' and m['age'] == '25' \
        and m['motto'] == '"never quit"', 'grok match failed:%s, %s' % (text, pat, )
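# For reference: grok pattern files use one `NAME pattern` definition per
# line, so ./test_patterns/pats presumably contains something equivalent to
#
#   ID %{WORD}-%{INT}
#
# (inferred from the inline custom_patterns dict in test_custom_pats above).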
def parse_line(line):
    # Parses a 12306 (China Railway ticketing) SMS-style notification line.
    # pattern = '【%{NOTSPACE:type}】%{NOTSPACE:id},%{INT:month}月%{INT:date}日%{NOTSPACE:train_number}次%{INT:train_id}车%{INT:seat_id}\s*'
    pattern = '【%{NOTSPACE:type}】'
    type = Grok(pattern).match(line)
    if type["type"] == '12306':
        # pattern2 = "%{INT:month}月%{INT:day}日%{NOTSPACE:train_number}次%{INT:train_id}车%{INT:seat_id},%{NOTSPACE:site_name}站%{HOUR:hour}:%{MINUTE:minute}开,%{NOTSPACE:ticket_gate}。"
        pattern2 = "%{INT:month}月%{INT:day}日%{NOTSPACE:train_number}次%{NOTSPACE:seat_info},%{NOTSPACE:site_name}站%{HOUR:hour}:%{MINUTE:min}开%{NOTSPACE}口%{NOTSPACE:ticket_gate}。"
        info = Grok(pattern2).match(line)
        return info
def test_struct_pats():
    import json

    custom_pats = {
        "POWER": "(?:%{NUMBER}|(-inf))",
        "HEX": "(?:(0[x,X])*(?:[0-9a-zA-Z]+))",
        "HEXENUM": "%{HEX:v}[ ]?",
        "HEXLINE": "%{HEXENUM:col:arr}\n?"
    }
    text = json.dumps({"a": {"c": {"d": 4}, "e": [1, 2, 3]}})

    custom_pats.update({"EA": "%{NUMBER:v:int}(,\s*)*"})
    custom_pats.update({"E": "(.*): \[%{EA:e:arr}\]"})
    grok = Grok("(.*): \[%{EA:e:arr}\]", custom_patterns=custom_pats, fullmatch=False)
    m = grok.match(text)
    assert m['e'][0]['v'] == 1, print(m['e'][0]['v'])
    assert m['e'][1]['v'] == 2, print(m['e'][1]['v'])
    assert m['e'][2]['v'] == 3, print(m['e'][2]['v'])

    custom_pats.update({"C": "{%{QS}: {%{QS}: %{NUMBER:d:int}}"})
    grok = Grok("%{C:c}", custom_patterns=custom_pats, fullmatch=False)
    m = grok.match(text)
    assert m['c_d'] == 4, print(m['c_d'])

    custom_pats.update({"A": "{(.*): %{C:c}, (.*): \[%{EA:e:arr}\]}"})
    grok = Grok("%{A:a}", custom_patterns=custom_pats, fullmatch=False)
    m = grok.match(text)
    assert m['a_c_d'] == 4, print(m['a_c_d'])
    assert m['a_e'][0]['v'] == 1, print(m['a_c_e'][0]['v'])
    assert m['a_e'][1]['v'] == 2, print(m['a_c_e'][1]['v'])
    assert m['a_e'][2]['v'] == 3, print(m['a_c_e'][2]['v'])
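# Note: the `fullmatch` keyword, the `:arr` modifier, and the nested
# dict/list match values exercised here appear to come from a fork or newer
# variant of pygrok; classic upstream pygrok returns only flat captures with
# optional `:int`/`:float` coercion.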
def scrape_rotations():
    #### Get the URL for each season's wiki page ####

    # Grab HTML
    soup = BeautifulSoup(requests.get(CHAMPION_ROTATION_ARCHIVE_URL).text, 'html.parser')

    # Access list of past champion rotations for each season
    htmlList = soup.find('div', class_='WikiaArticle').find_all('ul')[1].find_all('li')
    hrefList = []
    for li in htmlList:
        href = str(li.find('a')['href'])
        season = str(href.split('/')[-1])
        hrefList.append(WIKI_URL + href + "#" + "Pre-" + season)
        hrefList.append(WIKI_URL + href + "#" + season)

    #### Get champion and date information for each wiki ####
    week_number = 0
    date_pattern = '%{MONTHDAY:day} %{MONTH:month} %{YEAR:year}'
    grok = Grok(date_pattern)

    for url in hrefList:
        # Grab HTML
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        # Get the table for each week of the pre-season/season
        tables = soup.find_all('table', class_='wikitable')

        # Go through each week for the pre-season/season
        for table in tables:
            # Increment the week number
            week_number += 1

            # Get the dates for that week
            dates = table.find('p').getText().replace(u'\xa0', ' ').replace(u'\n', ' ').split(" - ")
            start_date = to_date(grok.match(dates[0]))
            end_date = to_date(grok.match(dates[1]))

            # Rotation attributes dictionary
            rotation_attributes = {
                "week_number": week_number,
                "start_date": start_date,
                "end_date": end_date
            }
            rotation = Rotation(**rotation_attributes)

            # Get the list of champions for that week
            for champ in table.find_all("div", {"data-game": "lol"}):
                champion = Champion.query.filter(Champion.name == champ.find('a')['title']).first()
                if champion:
                    rotation.champions.append(champion)

            # Add to DB session
            db.session.add(rotation)

    # Save to the DB
    db.session.commit()
def grok_misses_from_files(files, grok_pattern, custom_patterns=None, header_lines=0):
    """Return, per file, the lines (after any header) that the grok pattern fails to match."""
    grok = Grok(grok_pattern, custom_patterns=custom_patterns or {})
    pred = all_pass([skip_lines(header_lines), lambda x: grok.match(x[1]) is None])
    all_misses = []
    for file in files:
        all_misses.append(filter_file_with_pred(file, pred))
    return all_misses
def test_custom_pats():
    custom_pats = {"ID": "%{WORD}-%{INT}"}
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = "%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    grok = Grok(pat, custom_patterns=custom_pats)
    m = grok.match(text)
    assert (m["user_id"] == "Beijing-1104" and m["name"] == "gary"
            and m["age"] == "25" and m["motto"] == '"never quit"'), \
        "grok match failed:%s, %s" % (text, pat,)
def test_hotloading_pats():
    text = 'github'
    pat = '%{WORD:test_word}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['test_word'] == 'github', 'grok match failed:%s, %s' % (text, pat, )

    # matches
    text = '1989'
    pat = '%{NUMBER:birthyear:int}'
    grok.set_search_pattern(pat)
    m = grok.match(text)
    assert m == {'birthyear': 1989}, 'grok match failed:%s, %s' % (text, pat, )
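# set_search_pattern() swaps the compiled pattern in place, so a single Grok
# instance can be reused across patterns, which is handy in loops. A small
# sketch with invented inputs:
from pygrok import Grok

g = Grok('%{IP:ip}')
assert g.match('10.1.2.3') == {'ip': '10.1.2.3'}
g.set_search_pattern('%{HOSTNAME:host}')
assert g.match('example.com') == {'host': 'example.com'}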
def insert_log_data(self, data):
    print("Line ------ : {}".format(data))
    for line in data:
        for phrase in self.keep_phrases:
            if phrase in line:
                pattern = (
                    r'%{TIMESTAMP_ISO8601:timestamp}%{SPACE}(\[%{WORD:pid}%{SPACE}%{POSINT:pid}])'
                    r'%{SPACE}(\[%{NUMBER:responsetime}?ms])%{SPACE}(\[%{WORD:uid}\s+%{WORD:uidname}])'
                    r'%{SPACE}(\[%{LOGLEVEL:loglevel}])%{SPACE}(%{URIPATHPARAM:request})'
                    r'%{SPACE}%{GREEDYDATA:syslog_message}'
                )
                grok = Grok(pattern)
                grok_json = grok.match(line)
                post_id = self.insert_data(grok_json)
                print("Grok json : {}".format(grok_json))
                # post_id = self.collection.insert_one(grok_json).inserted_id
                print(post_id)
                self.format_single_doc(post_id, grok_json)
def get_saml_response(driver, debug=False, sleeptime=0.5):
    # Uses grok to pull the SAML response out of the log entry; should be redone with bs4
    while True:
        time.sleep(sleeptime)
        for entry in driver.get_log('performance'):
            if 'samlresponse' in str(entry).lower() and \
                    '"documenturl":"https://signin.aws.amazon.com/saml"' in str(entry).lower():
                if debug:
                    print("Saml matches: " + str(entry))
                from pygrok import Grok
                pattern = '%{GREEDYDATA}SAMLResponse=%{DATA:samlresponse}&%{GREEDYDATA}"%{GREEDYDATA}'
                grok = Grok(pattern)
                saml_resp_enc = grok.match(str(entry))['samlresponse']
                saml_resp_dec = unquote(saml_resp_enc)
                return saml_resp_dec
def from_file_path(cls, path):
    pattern = '^resource "%{WORD:resource_type}" "%{DATA:resource_name}" {'
    grok = Grok(pattern)
    with open(path, "r") as f:
        lines = f.readlines()
    tf_resources = []
    for line in lines:
        res = grok.match(line)
        if res is not None and "resource_type" in res and "resource_name" in res:
            tf_resources.append(TFResource(**res))
    return cls(tf_resources)
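# The pattern above matches Terraform resource headers such as
#   resource "aws_s3_bucket" "logs" {
# yielding {'resource_type': 'aws_s3_bucket', 'resource_name': 'logs'}, which
# is splatted into the project's TFResource constructor.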
def test_custom_pat_files():
    import os.path

    pats_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_patterns")
    text = 'Beijing-1104,gary 25 "never quit"'
    # pattern "ID" is defined in ./test_patterns/pats
    pat = "%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    grok = Grok(pat, custom_patterns_dir=pats_dir)
    m = grok.match(text)
    assert (m["user_id"] == "Beijing-1104" and m["name"] == "gary"
            and m["age"] == "25" and m["motto"] == '"never quit"'), \
        "grok match failed:%s, %s" % (text, pat,)
def test_exim_received():
    """Test exim incoming email"""
    log_entry = '2017-06-24 15:41:05 1dOqvZ-0002HB-Ob <= '\
        '[email protected] H=in3f.electric.net '\
        '[72.35.12.46]:47218 P=esmtps X=TLSv1.2:ECDHE-RSA-AES256-GCM-SHA384:256 '\
        'CV=no S=6297 T="Millionaires, CEOs, Entrepreneurs are using brain pills '\
        'to boost intelligence." for [email protected]'
    patterns_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok('%{SYSLOG_EXIM}', custom_patterns_dir=patterns_directory).match(log_entry)
    assert grok_output['exim_timestamp'] == '2017-06-24 15:41:05'
    assert grok_output['exim_log_id'] == '1dOqvZ-0002HB-Ob'
    assert grok_output['exim_remote_host'] == 'in3f.electric.net'
    assert grok_output['exim_remote_ip'] == '72.35.12.46'
    assert grok_output['exim_tls_cipher'] == 'TLSv1.2:ECDHE-RSA-AES256-GCM-SHA384:256'
    assert grok_output['exim_tls_certificate_verified'] == 'no'
    assert grok_output['exim_sender_address'] == '*****@*****.**'
    assert grok_output['exim_port'] == '47218'
    assert grok_output['exim_protocol'] == 'esmtps'
    assert grok_output['exim_message_size'] == '6297'
    assert grok_output['exim_subject'] == 'Millionaires, CEOs, Entrepreneurs '\
        'are using brain pills to boost intelligence.'
def extractAndStripDateFromStory(df):
    stories = list(df['story'])
    date_pattern = '%{MONTH:month} %{YEAR:year}'
    grok = Grok(date_pattern)
    month = []
    story = []
    for s in stories:
        d = grok.match(s)
        dt = d['month'] + " " + d['year'] + "."
        month.append(d['month'])
        # Escape the date string so its trailing "." is not treated as a regex wildcard
        story.append(re.sub(re.escape(dt), '', s))
    df['story'] = story
    df['Month'] = month
def extract_derived_fields(self, doc_fields):
    derived_fields = dict()
    for field_name, grok_pattern in self.settings.config.items("derivedfields"):
        if helpers.utils.dict_contains_dotkey(doc_fields, field_name, case_sensitive=False):
            if grok_pattern in self.grok_filters.keys():
                grok = self.grok_filters[grok_pattern]
            else:
                grok = Grok(grok_pattern)
                self.grok_filters[grok_pattern] = grok

            match_dict = grok.match(helpers.utils.get_dotkey_value(doc_fields, field_name, case_sensitive=False))

            if match_dict:
                for match_dict_k, match_dict_v in match_dict.items():
                    derived_fields[match_dict_k] = match_dict_v

    return derived_fields
def test_exim_smtp_outgoing():
    """Test exim outgoing smtp"""
    log_entry = '2017-06-24 23:36:21 1dOyLV-000359-20 SMTP connection outbound '\
        '1498361781 1dOyLV-000359-20 domain.com [email protected]'
    patterns_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok('%{SYSLOG_EXIM}', custom_patterns_dir=patterns_directory).match(log_entry)
    assert grok_output['exim_log_id'] == '1dOyLV-000359-20'
    assert grok_output['exim_sender'] == 'domain.com'
    assert grok_output['exim_external_recipient'] == '*****@*****.**'
    assert grok_output['exim_timestamp'] == '2017-06-24 23:36:21'
def test_arr_pats():
    custom_pats = {
        "POWER": "(?:%{NUMBER}|(-inf))",
        "HEX": "(?:(0[x,X])*(?:[0-9a-zA-Z]+))",
        "HEXENUM": "%{HEX:v}[ ]?",
        "HEXLINE": "%{HEXENUM:col:arr}\n?"
    }
    text = '0x1 0x2 0x3'
    pat = '%{HEXLINE:row}'
    grok = Grok(pat, custom_patterns=custom_pats, fullmatch=False)
    m = grok.match(text)
    assert (m['row'] == '0x1 0x2 0x3' and m['row_col']['_str'] == '0x1 0x2 0x3'
            and m['row_col'][0]["v"] == '0x1' and m['row_col'][1]["v"] == '0x2'
            and m['row_col'][2]["v"] == '0x3'), 'grok match failed:%s, %s' % (text, pat, )

    # matches
    text = '0x1 0x2 0x3\n0x4 0x5 0x6\n0x7 0x8 0x9'
    pat = '%{HEXLINE:row:arr}'
    grok.set_search_pattern(pat)
    m = grok.match(text)
    for i in range(2):
        for j in range(3):
            assert m['row'][i]['col'][j]['v'] == "0x" + "%d" % (i * 3 + j + 1), \
                'grok match failed:%s, %s' % (text, pat, )
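# The `:arr` modifier (a fork feature; see the note under test_struct_pats)
# gathers repeated sub-matches into a list of dicts and keeps the full
# matched span under the '_str' key, which is what the assertions above rely on.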
def inputa():
    req_pattern = "%{WORD:method} %{URIPATHPARAM:url} %{WORD:version}"
    grok = Grok(req_pattern)
    f_n = 'access.log-2017-10-17-1508216401.log'
    f = open(f_n, "r")
    http = urllib3.PoolManager()
    while True:
        line = f.readline()
        if not line:
            break
        try:
            _l = json.loads(line)
        except json.JSONDecodeError:
            continue
        _l.update(_l.pop('fields'))
        _l['time'] = _l.pop('time')  # moves 'time' to the end of the dict
        req_t = _l.pop("request")
        remote_addr = _l.get("remote_addr")
        # if remote_addr == '-':
        #     continue
        # response = reader.city(remote_addr)
        # _l['coordinates'] = {
        #     "lon": response.location.longitude,
        #     "lat": response.location.latitude
        # }
        # _l['geoip'] = {
        #     "country_name": response.country.name,
        #     "region_name": response.city.name
        # }
        _l.update(grok.match(req_t))
        # _l["@timestamp"] = datetime.now().isoformat()
        # _l["url"] = _l.get("url").replace("/app", "")
        # if _l.get("url").startswith("/app/search/all/"):
        #     _l["url"] = urllib.parse.unquote(_l.get("url"))
        body = json.dumps(_l)
        print(_l)
        r = http.request('POST', '127.0.0.1:8885/master/data/input/',
                         headers={"Content-Type": "application/json"},
                         body=body)
def test_exim_failure():
    """Test exim message failure"""
    log_entry = '2017-06-18 04:03:59 1dMVBf-0007Wx-F5 ** '\
        '[email protected] R=virtual_aliases: No Such User Here'
    patterns_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok('%{SYSLOG_EXIM}', custom_patterns_dir=patterns_directory).match(log_entry)
    assert grok_output['exim_timestamp'] == '2017-06-18 04:03:59'
    assert grok_output['exim_log_id'] == '1dMVBf-0007Wx-F5'
    assert grok_output['exim_final_delivery_address'] == '*****@*****.**'
    assert grok_output['exim_router'] == 'virtual_aliases'
    assert grok_output['exim_failure_message'] == 'No Such User Here'
def test_exim_smtp():
    """Test exim incoming smtp delivery."""
    log_entry = '2017-06-24 15:31:43 SMTP connection from mail.example.com '\
        '[192.168.1.1]:46890 closed by QUIT'
    patterns_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok('%{SYSLOG_EXIM}', custom_patterns_dir=patterns_directory).match(log_entry)
    assert grok_output['exim_port'] == '46890'
    assert grok_output['exim_url'] == 'mail.example.com'
    assert grok_output['exim_ip_address'] == '192.168.1.1'
    assert grok_output['exim_smtp_message'] == 'closed by QUIT'
    assert grok_output['exim_timestamp'] == '2017-06-24 15:31:43'
def test_multiple_pats():
    text = 'gary 25 "never quit"'
    pat = '%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['name'] == 'gary' and m['age'] == '25' and m['motto'] == '"never quit"', \
        'grok match failed:%s, %s' % (text, pat, )

    # variable names are not set
    text = 'gary 25 "never quit"'
    pat = '%{WORD} %{INT} %{QUOTEDSTRING}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {}, 'grok match failed:%s, %s' % (text, pat, )

    # "male" is not an INT
    text = 'gary male "never quit"'
    pat = '%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m is None, 'grok match failed:%s, %s' % (text, pat, )

    # nginx log
    text = ('edge.v.iask.com.edge.sinastorage.com 14.18.243.65 6.032s - [21/Jul/2014:16:00:02 +0800]'
            ' "GET /edge.v.iask.com/125880034.hlv HTTP/1.0" 200 70528990 "-"'
            ' "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
            ' Chrome/36.0.1985.125 Safari/537.36"')
    pat = (r'%{HOSTNAME:host} %{IP:client_ip} %{NUMBER:delay}s - \[%{DATA:time_stamp}\]'
           r' "%{WORD:verb} %{URIPATHPARAM:uri_path} HTTP/%{NUMBER:http_ver}" %{INT:http_status} %{INT:bytes} %{QS}'
           r' %{QS:client}')
    grok = Grok(pat)
    m = grok.match(text)
    assert m['host'] == 'edge.v.iask.com.edge.sinastorage.com' and m['client_ip'] == '14.18.243.65' \
        and m['delay'] == '6.032' and m['time_stamp'] == '21/Jul/2014:16:00:02 +0800' and m['verb'] == 'GET' \
        and m['uri_path'] == '/edge.v.iask.com/125880034.hlv' and m['http_ver'] == '1.0' \
        and m['http_status'] == '200' and m['bytes'] == '70528990' \
        and m['client'] == ('"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                            ' Chrome/36.0.1985.125 Safari/537.36"'), 'grok match failed:%s, %s' % (text, pat, )

    text = '1989/02/23'
    pat = '%{NUMBER:birthyear:int}/%{NUMBER:birthmonth:int}/%{NUMBER:birthday:int}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {'birthyear': 1989, 'birthmonth': 2, 'birthday': 23}, 'grok match failed:%s, %s' % (text, pat, )

    text = 'load average: 1.88, 1.73, 1.49'
    pat = 'load average: %{NUMBER:load_1:float}, %{NUMBER:load_2:float}, %{NUMBER:load_3:float}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {'load_1': 1.88, 'load_2': 1.73, 'load_3': 1.49}, 'grok match failed:%s, %s' % (text, pat, )
def test_one_pat():
    text = '1024'
    pat = '%{INT:test_int}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['test_int'] == '1024', 'grok match failed:%s, %s' % (text, pat, )

    text = '1024'
    pat = '%{NUMBER:test_num}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['test_num'] == '1024', 'grok match failed:%s, %s' % (text, pat, )

    text = 'garyelephant '
    pat = '%{WORD:name} '
    grok = Grok(pat)
    m = grok.match(text)
    assert m['name'] == text.strip(), 'grok match failed:%s, %s' % (text, pat, )

    text = '192.168.1.1'
    pat = '%{IP:ip}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['ip'] == text.strip(), 'grok match failed:%s, %s' % (text, pat, )

    text = 'github.com'
    pat = '%{HOSTNAME:website}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['website'] == text.strip(), 'grok match failed:%s, %s' % (text, pat, )

    text = '1989-11-04 05:33:02+0800'
    pat = '%{TIMESTAMP_ISO8601:ts}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['ts'] == text.strip(), 'grok match failed:%s, %s' % (text, pat, )

    text = 'github'
    pat = '%{WORD}'
    grok = Grok(pat)
    m = grok.match(text)
    # you get nothing because the variable name is not set;
    # compare "%{WORD}" with "%{WORD:variable_name}"
    assert m == {}, 'grok match failed:%s, %s' % (text, pat, )

    text = 'github'
    pat = '%{NUMBER:test_num}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m is None, 'grok match failed:%s, %s' % (text, pat, )  # no match

    text = '1989'
    pat = '%{NUMBER:birthyear:int}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {'birthyear': 1989}, 'grok match failed:%s, %s' % (text, pat, )