Exemplo n.º 1
0
    def process(self, data_to_process: str) -> Dict[str, Any]:
        """Parse one IIS-style log line into a dict ready for DB insertion.

        Sample input:
        2019-09-11 02:02:45 WEBSITE-LIDL-ACCOUNT-PROD-WE GET / X-ARR-LOG-ID=b7fea7b7-7913-43a9-87a6-59bcca58e4b7 443 - 51.105.161.182 - - - website-lidl-account-prod-we.lidl-account-ase-prod-we.p.azurewebsites.net 200 0 0 6180 1065 15

        Uses a grok pattern to parse the string.

        Args:
            data_to_process: str containing the IIS log line.

        Returns:
            Dict with the selected fields plus the full parse serialized as JSON.
        """

        iis_pattern = "%{TIMESTAMP_ISO8601:log_timestamp} %{NOTSPACE:sitename} %{WORD:cs_method} %{URIPATH:cs_uri_stem} %{NOTSPACE:cs_uri_query} %{NUMBER:s_port} %{NOTSPACE:cs_username} %{IPORHOST:c_ip} %{NOTSPACE:cs_useragent} %{NOTSPACE:cs_cookie} %{NOTSPACE:cs_referer} %{IPORHOST:cs_host} %{NUMBER:sc_status} %{NUMBER:sc_substatus} %{NUMBER:sc_win32_status} %{NUMBER:sc_bytes} %{NUMBER:cs_bytes} %{NUMBER:time_taken}"
        logger.debug(f"Parsing: {data_to_process}")
        fields = Grok(iis_pattern).match(data_to_process)
        return {
            "time": fields["log_timestamp"],
            "status_code": fields["sc_status"],
            "outbound_data": fields["sc_bytes"],
            "inbound_data": fields["cs_bytes"],
            "time_taken": fields["time_taken"],
            # keep the complete parse around for later inspection
            "raw_data": json.dumps(fields),
        }
Exemplo n.º 2
0
def test_match_unnamed():
    """With match_unnamed_groks=True, unnamed sub-patterns of %{URI}
    (USER, HOSTNAME, the URI itself) appear in the match dict.

    FIX: the URL's userinfo had been scrubbed to "*****:*****", which
    contradicted the ``m["USER"] == "foo"`` assertion below; restored
    credentials consistent with the assertions.
    """
    url = "https://foo:bar@test.com/path?query=1"
    grok = Grok("%{URI}", match_unnamed_groks=True)
    m = grok.match(url)
    assert m["USER"] == "foo"
    assert m["URI"] == url
    assert m["HOSTNAME"] == "test.com"
Exemplo n.º 3
0
    def extract_derived_fields(self, doc_fields):
        """
        Extract derived fields based on a document.

        Compiled Grok objects are cached in self.grok_filters so each
        pattern is only compiled once across calls.

        FIX: the try block previously wrapped the grok lookup and match as
        well, so a KeyError raised anywhere inside (not just by the dotkey
        lookup) was silently swallowed; the try now covers only the lookup.

        :param doc_fields: document information used to extract derived fields
        :return: dict of all derived fields
        """
        derived_fields = dict()
        for field_name, grok_pattern in self.settings.list_derived_fields:
            try:
                # get_dotkey_value raises KeyError when the key is absent
                doc_value = helpers.utils.get_dotkey_value(
                    doc_fields, field_name, case_sensitive=False)
            except KeyError:
                continue  # Ignore, value not found...

            if grok_pattern in self.grok_filters.keys():
                grok = self.grok_filters[grok_pattern]
            else:
                grok = Grok(grok_pattern)
                self.grok_filters[grok_pattern] = grok

            match_dict = grok.match(doc_value)

            if match_dict:
                derived_fields.update(match_dict)

        return derived_fields
    def make_parser(file: dict, groks_dir: str) -> Parser:
        """
        Builds a parser from a config file entry and a directory of extra grok expressions.
        :param file: A single `file` element from the array within the config file.
        :param groks_dir: A directory to find additional grok patterns
        :return: A parser for the specified configuration
        """

        # FIX: the previous check was `all(["type", ...] for key in file)`,
        # which evaluated the truthiness of the literal list (always True)
        # for every key, so missing keys were never detected.
        if not all(key in file for key in ("type", "strptime", "path")):
            raise ValueError(
                f"File entry requires 'type', 'strptime' and 'path' keys. Found:\n{file}"
            )

        file_type: str = file["type"]
        strptime_pattern: str = file["strptime"]
        groks = [
            Grok(grok, custom_patterns_dir=groks_dir) for grok in file["path"]
        ]

        if "grok" in file:
            grok_name = file["grok"]
        else:
            # Guess grok from type name...
            # Grok patterns don't support hyphenation
            grok_name = "%%{%s}" % file_type.upper().replace("-", "")

        return Parser(
            file_type,
            Grok(grok_name, custom_patterns_dir=groks_dir),
            groks,
            strptime_pattern,
        )
Exemplo n.º 5
0
def date_finder(text):
    """Try several date-extraction libraries on *text* and print what each finds.

    Approaches tried, in order: dateutil, pygrok, datefinder, regex+strptime,
    an NLTK chunk grammar, pydatum, and dateparser. Each attempt is wrapped
    so a failure in one library does not stop the others.

    FIX vs original: Python 2 ``print`` statements converted to py3 calls,
    the undefined name ``s`` replaced with the ``text`` parameter, and the
    py2-only ``str.decode`` replaced with an encode/decode round-trip.

    :param text: free-form text possibly containing dates
    """
    date = ""
    date_pattern = '%{YEAR:year}-%{MONTHNUM:month}-%{MONTHDAY:day}'
    matches = list(datefinder.find_dates(text))
    match_date = re.search(r'\d{4}-\d{2}-\d{2}', text)

    try:
        print("====using dateutil")
        for i in text.splitlines():
            d = parser.parse(i)
            print(d.strftime("%Y-%m-%d"))
    except Exception as e:
        print(e)
    try:
        print("====pygrok===")
        grok = Grok(date_pattern)
        print(grok.match(text))
    except Exception as e:
        print(e)
    try:
        print("====using date===")
        if len(matches) > 0:
            date = matches[0]
            print(date)
        else:
            print('No dates found')
    except Exception as e:
        print(e)
    try:
        print("====using date===")
        date = datetime.datetime.strptime(match_date.group(), '%Y-%m-%d').date()
        print(date)
    except Exception as e:
        print(e)
    try:
        print("====using Chunkgrams===")
        # chunk proper-noun + cardinal sequences as candidate entities
        chunkGram = r"""NE:{<NNP>+<CD>}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(sentence.strip()) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(i) for i in tokenized_sentences]
        chunked_sentences = [chunkParser.parse(i) for i in tagged_sentences]
        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
        print(entity_names)
    except Exception as e:
        print(e)
    try:
        print("===using pydatum==")
        datum = Datum()
        print(datum.from_iso_date_string(text))
    except Exception as e:
        print(e)
    try:
        print("===using dateparser==")
        # py3: str has no .decode(); drop non-ascii via encode/decode instead
        date = search_dates(text.encode('ascii', 'ignore').decode('ascii'))
        print(date)
    except Exception as e:
        print(e)
Exemplo n.º 6
0
def parse(log):
    """Grok-parse a log line and decode its trailing data field.

    The captured ``data`` field is hex-encoded base64; it is decoded
    back to text in place before the record is returned.

    :param log: raw log line of the form "<type:severity>timestamp ip:host data"
    :return: dict of captured fields with 'data' decoded
    """
    grok_pattern = (
        "<%{WORD:type}:%{WORD:severity}>%{TIMESTAMP_ISO8601:timestamp}%{SPACE}%{IP:ip}:%{IPORHOST:host}%{SPACE}%{GREEDYDATA:data}"
    )
    record = Grok(grok_pattern).match(log)
    record["data"] = b64decode(bytes.fromhex(record["data"])).decode()
    return record
Exemplo n.º 7
0
def calc_api_stat_daily(logfile):
    """Aggregate per-API request-duration statistics from an access log.

    Each matching line contributes its request duration (converted to ms)
    to the count/avg/max/min of its API (query string stripped, then
    normalized via merge_api). One tab-separated row per API is printed.

    :param logfile: path to the access log to analyze
    """
    pattern = '%{IPORHOST:clientip} - - \[%{HTTPDATE:timestamp}\] "%{WORD:verb} %{URIPATHPARAM:request} HTTP/%{NUMBER:httpversion}" %{NUMBER:response} (?:%{NUMBER:bytes}|-) (?:"(?:%{URI:referrer}|-)"|%{QS:referrer}) %{QS:agent} %{QS:xforwardedfor} %{BASE10NUM:request_duration}'
    grok = Grok(pattern)
    stats = {}
    with open(logfile, 'r') as fin:
        for line in fin:
            parsed = grok.match(line)
            if parsed is None:
                continue  # line did not match the access-log format
            duration = float(parsed['request_duration']) * 1000
            api = merge_api(parsed['request'].split('?')[0])
            entry = stats.get(api, None)
            if entry is None:
                stats[api] = {
                    'cnt': 1,
                    'avg': duration,
                    'max': duration,
                    'min': duration
                }
            else:
                entry['max'] = max(entry['max'], duration)
                entry['min'] = min(entry['min'], duration)
                # running average without storing all samples
                entry['avg'] = (entry['avg'] * entry['cnt'] +
                                duration) / (entry['cnt'] + 1)
                entry['cnt'] += 1
    for k, v in stats.items():
        print('{}\t{}\t{}\t{}\t{}'.format(k, v['cnt'], round(v['avg'], 0),
                                          v['max'], v['min']))
Exemplo n.º 8
0
def parse_base(syslog):
    """Grok-parse syslog['message'] and merge extracted fields into *syslog*.

    Mutates the given dict in place when the pattern matches; does nothing
    otherwise. Returns None either way.

    :param syslog: dict carrying the raw log line under the 'message' key
    """
    pattern = "<%{NUMBER:pri:int}>(?<logTime>(%{MONTH} +%{MONTHDAY} %{TIME}( %{YEAR})?|%{MONTH} +%{MONTHDAY} %{YEAR} %{TIME})) %{DATA:loghostname} %%%{NUMBER}%{DATA:module}/%{NUMBER:severity:int}/%{DATA:logTypeDesc}:(( -%{DATA:location};)?|(s)?) +%{GREEDYDATA:desc}"
    fields = Grok(pattern).match(syslog["message"])
    if fields:
        syslog.update(fields)
Exemplo n.º 9
0
def test_custom_pats():
    """A custom pattern supplied via ``custom_patterns`` resolves like a builtin."""
    custom_pats = {'ID': '%{WORD}-%{INT}'}
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    m = Grok(pat, custom_patterns=custom_pats).match(text)
    failure = 'grok match failed:%s, %s' % (text, pat, )
    assert m['user_id'] == 'Beijing-1104', failure
    assert m['name'] == 'gary', failure
    assert m['age'] == '25', failure
    assert m['motto'] == '"never quit"', failure
Exemplo n.º 10
0
def test_custom_pats():
    """A user-supplied 'ID' pattern should combine with builtin patterns."""
    user_patterns = {'ID': '%{WORD}-%{INT}'}
    sample = 'Beijing-1104,gary 25 "never quit"'
    search = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    result = Grok(search, custom_patterns=user_patterns).match(sample)
    expected = {'user_id': 'Beijing-1104', 'name': 'gary',
                'age': '25', 'motto': '"never quit"'}
    assert all(result[key] == value for key, value in expected.items()), \
        'grok match failed:%s, %s' % (sample, search, )
Exemplo n.º 11
0
    def do_sync(self):
        """Read data out of text files and write it to stdout following Singer spec.

        For every configured directory: emit the schema, stream records out of
        each file (jsonl / csv / log formats) in batches of RECORDS_PER_BATCH,
        flush the remainder after each file, and emit state.

        FIX: files are now opened via context managers; previously the
        ``open(...)`` handles in the jsonl and log branches were never closed.
        """

        LOGGER.info("Extracting data")

        # read/persist the file in batches
        RECORDS_PER_BATCH = 100

        for dirpath, d in sorted(self.directories.items()):
            dirname = d['dirname']
            key_properties = ['_singer_gen_key'] if self.rec_hash_keys else []
            LOGGER.info('Writing schema for `{}`'.format(dirname))
            singer.write_schema(dirname, d['schema'], key_properties)
            if len(d['files']) > 0:
                LOGGER.info('Extracting data from `{}`'.format(dirname))
                lines_to_write = []
                for f in sorted(d['files'], key=lambda x: x['filename']):
                    if self.file_format == 'jsonl':
                        with open(f['absolute_path'], 'r') as fh:
                            for line in fh:
                                parsed_line = json.loads(line)
                                parsed_line = self._add_key_to_rec(
                                    parsed_line, line)
                                lines_to_write.append(parsed_line)
                                if len(lines_to_write) >= RECORDS_PER_BATCH:
                                    singer.write_records(dirname, lines_to_write)
                                    lines_to_write = []
                    elif self.file_format == 'csv':
                        for df in pd.read_csv(f['absolute_path'],
                                              parse_dates=False,
                                              chunksize=1):
                            rec = df.to_dict('records')[0]
                            rec = self._add_key_to_rec(rec)
                            lines_to_write.append(rec)
                            if len(lines_to_write) >= RECORDS_PER_BATCH:
                                singer.write_records(dirname, lines_to_write)
                                lines_to_write = []
                    elif self.file_format == 'log':
                        # TODO Use pattern per table and get it not from config
                        grok = Grok(CONFIG['grok_pattern'])
                        with open(f['absolute_path'], 'r') as fh:
                            for line in fh:
                                parsed_line = grok.match(line)
                                if not parsed_line:
                                    # keep unparseable lines, raw-only
                                    parsed_line = {}
                                parsed_line['_sdc_raw_log_line'] = line
                                lines_to_write.append(parsed_line)
                                if len(lines_to_write) >= RECORDS_PER_BATCH:
                                    singer.write_records(dirname, lines_to_write)
                                    lines_to_write = []

                    # flush whatever is left over after each file
                    singer.write_records(d['dirname'], lines_to_write)
                    lines_to_write = []

                # lines_to_write is always empty here; kept for parity with
                # the per-file flush above (a no-op batch write)
                singer.write_records(d['dirname'], lines_to_write)
            LOGGER.info('Writing state for `{}`'.format(dirname))
            singer.write_state(self.state)

        LOGGER.info('Writing final state')
        singer.write_state(self.state)
Exemplo n.º 12
0
def test_predefined_patterns():
    """Every predefined grok pattern should compile without raising."""
    base = Grok("%{DATA}")
    failures = []
    for name in base.predefined_patterns:
        try:
            Grok("%{" + name + "}")
        except Exception as exc:
            failures.append((name, str(exc)))
    assert failures == []
Exemplo n.º 13
0
    def build_schemas(self):
        """Do a pass over the files and use GenSon to generate their schemas.

        Any previously persisted schema for a directory is used as the seed so
        the generated schema only ever widens. Results are stored both on
        self.directories[...]['schema'] and in self.state['schemas'].

        FIX: files are now opened via context managers; previously the
        ``open(...)`` handles in the jsonl and log branches were never closed.
        """

        # TODO add sampling so that we don't have to pass over every single record

        LOGGER.info('Building schemas')

        if not self.state.get('schemas'):
            self.state['schemas'] = {}

        for dirpath, d in self.directories.items():
            dirname = d['dirname']
            LOGGER.info('Building schema for `{}`'.format(dirname))
            schema_builder = SchemaBuilder()

            if not self.state['schemas'].get(dirname):
                self.state['schemas'][dirname] = {
                    "type": "object",
                    "properties": {}
                }
            else:
                LOGGER.info(
                    "Existing schema for `{}` will be used as seed schema".
                    format(dirname))

            schema_builder.add_schema(self.state['schemas'][dirname])

            for f in d['files']:
                if self.file_format == 'jsonl':
                    with open(f['absolute_path'], 'r') as fh:
                        for line in fh:
                            parsed_line = json.loads(line)
                            parsed_line = self._add_key_to_rec(parsed_line, line)
                            schema_builder.add_object(parsed_line)
                elif self.file_format == 'csv':
                    # Note: parsing dates is pointless until date formatting support in GenSon
                    for df in pd.read_csv(f['absolute_path'],
                                          parse_dates=False,
                                          chunksize=1):
                        rec = df.to_dict('records')[0]
                        rec = self._add_key_to_rec(rec)
                        schema_builder.add_object(rec)
                elif self.file_format == 'log':
                    # TODO Use pattern per table and get it not from config
                    grok = Grok(CONFIG['grok_pattern'])
                    with open(f['absolute_path'], 'r') as fh:
                        for line in fh:
                            parsed_line = grok.match(line)
                            if not parsed_line:
                                parsed_line = {}
                            parsed_line['_sdc_raw_log_line'] = line
                            schema_builder.add_object(parsed_line)

            self.directories[dirpath]['schema'] = schema_builder.to_schema()
            self.state['schemas'][dirname] = self.directories[dirpath][
                'schema']

        LOGGER.info('Done building schemas')
Exemplo n.º 14
0
def test_custom_pat_files():
    """Patterns loaded from a custom directory should work like builtins."""
    import os.path
    # pattern "ID" is defined in ./test_patterns/pats
    pats_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_patterns')
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    m = Grok(pat, custom_patterns_dir=pats_dir).match(text)
    err = 'grok match failed:%s, %s' % (text, pat, )
    assert m['user_id'] == 'Beijing-1104', err
    assert m['name'] == 'gary', err
    assert m['age'] == '25', err
    assert m['motto'] == '"never quit"', err
Exemplo n.º 15
0
def parse_line(line):
    """Parse a ticket-notification SMS line.

    Extracts the sender type from the 【...】 header; for '12306' messages the
    train/seat/station details are parsed out of the body.

    FIX: ``info`` was previously unbound when the type was not '12306'
    (UnboundLocalError at the return), and a non-matching header caused a
    TypeError on ``None["type"]``. Both cases now return None.

    :param line: raw SMS text
    :return: dict of parsed fields for 12306 messages, otherwise None
    """
    pattern = '【%{NOTSPACE:type}】'
    header = Grok(pattern).match(line)

    info = None
    if header is not None and header["type"] == '12306':
        pattern2 = "%{INT:month}月%{INT:day}日%{NOTSPACE:train_number}次%{NOTSPACE:seat_info},%{NOTSPACE:site_name}站%{HOUR:hour}:%{MINUTE:min}开%{NOTSPACE}口%{NOTSPACE:ticket_gate}。"
        info = Grok(pattern2).match(line)
    return info
Exemplo n.º 16
0
def test_custom_pat_files():
    """Custom patterns from a directory combine with builtin ones."""
    import os.path
    # pattern "ID" is defined in ./test_patterns/pats
    here = os.path.dirname(os.path.abspath(__file__))
    pats_dir = os.path.join(here, 'test_patterns')
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = '%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat, custom_patterns_dir=pats_dir)
    m = grok.match(text)
    expected = {'user_id': 'Beijing-1104', 'name': 'gary',
                'age': '25', 'motto': '"never quit"'}
    assert all(m[k] == v for k, v in expected.items()), \
        'grok match failed:%s, %s' % (text, pat, )
Exemplo n.º 17
0
def test_struct_pats():
    """Nested custom patterns with ':arr' captures should build structured
    results (lists of sub-match dicts) and flatten nested scalar captures
    into underscore-joined keys (e.g. 'a_c_d').

    FIX: assertion messages previously used ``print(...)`` (which evaluates
    to None) and the last three referenced a non-existent key 'a_c_e',
    which would raise KeyError instead of reporting the failure; plain
    value expressions are used as messages instead.
    """
    import json
    custom_pats = {
        "POWER": "(?:%{NUMBER}|(-inf))",
        "HEX": "(?:(0[x,X])*(?:[0-9a-zA-Z]+))",
        "HEXENUM": "%{HEX:v}[ ]?",
        "HEXLINE": "%{HEXENUM:col:arr}\n?"
    }

    text = json.dumps({"a": {"c": {"d": 4}, "e": [1, 2, 3]}})
    custom_pats.update({"EA": "%{NUMBER:v:int}(,\s*)*"})
    custom_pats.update({"E": "(.*): \[%{EA:e:arr}\]"})
    grok = Grok("(.*): \[%{EA:e:arr}\]",
                custom_patterns=custom_pats,
                fullmatch=False)
    m = grok.match(text)
    assert m['e'][0]['v'] == 1, m['e'][0]['v']
    assert m['e'][1]['v'] == 2, m['e'][1]['v']
    assert m['e'][2]['v'] == 3, m['e'][2]['v']

    custom_pats.update({"C": "{%{QS}: {%{QS}: %{NUMBER:d:int}}"})
    grok = Grok("%{C:c}", custom_patterns=custom_pats, fullmatch=False)
    m = grok.match(text)
    assert m['c_d'] == 4, m['c_d']

    custom_pats.update({"A": "{(.*): %{C:c}, (.*): \[%{EA:e:arr}\]}"})

    grok = Grok("%{A:a}", custom_patterns=custom_pats, fullmatch=False)

    m = grok.match(text)
    assert m['a_c_d'] == 4, m['a_c_d']
    assert m['a_e'][0]['v'] == 1, m['a_e'][0]['v']
    assert m['a_e'][1]['v'] == 2, m['a_e'][1]['v']
    assert m['a_e'][2]['v'] == 3, m['a_e'][2]['v']
Exemplo n.º 18
0
def scrape_rotations():
	"""Scrape weekly free-champion rotation data from the wiki archive and
	persist one Rotation row (with its champions) per week to the database.

	NOTE(review): depends entirely on the wiki page layout (second <ul>
	inside .WikiaArticle, 'wikitable' tables, '<p>' date ranges) -- confirm
	against the live page before relying on this.
	"""
	##### Get the URL for each season's wiki page ####
	# Grab HTML
	soup = BeautifulSoup(requests.get(CHAMPION_ROTATION_ARCHIVE_URL).text, 'html.parser')

	# Access list of past champion rotations for each season; build two
	# anchored URLs per season (pre-season and season sections)
	htmlList = soup.find('div', class_='WikiaArticle').find_all('ul')[1].find_all('li')
	hrefList = []
	for l in htmlList:
		href = str(l.find('a')['href'])
		season = str(href.split('/')[-1])
		hrefList.append(WIKI_URL + href + "#" + "Pre-" + season)
		hrefList.append(WIKI_URL + href + "#" + season)

	#### Get champion and date information for each wiki ####
	# week_number runs continuously across all seasons/pages
	week_number = 0
	# dates are written as e.g. "21 July 2014" (day month year)
	date_pattern = '%{MONTHDAY:day} %{MONTH:month} %{YEAR:year}'
	grok = Grok(date_pattern)

	for url in hrefList:
		# Grab HTML
		soup = BeautifulSoup(requests.get(url).text, 'html.parser')
		# Get the table for each week of the pre-season/season
		tables = soup.find_all('table', class_='wikitable')

		# Go through each week for the pre-season/season
		for table in tables:
			# Increment the week number
			week_number += 1

			# Get the dates for that week: "<start> - <end>" with nbsp/newlines
			# normalized to plain spaces before splitting
			dates = table.find('p').getText().replace(u'\xa0', ' ').replace(u'\n', ' ').split(" - ")
			start_date = to_date(grok.match(dates[0]))
			end_date = to_date(grok.match(dates[1]))

			# Rotation attributes dictionary
			rotation_attributes = {
				"week_number": week_number,
				"start_date": start_date,
				"end_date": end_date
			}
			rotation = Rotation(**rotation_attributes)

			# Get the list of champions for that week; only champions already
			# present in the Champion table are attached
			for champ in table.find_all("div", {"data-game": "lol"}):
				champion = Champion.query.filter(Champion.name == champ.find('a')['title']).first()
				if champion:
					rotation.champions.append(champion)
			
			# Add to DB session
			db.session.add(rotation)

	# Save to the DB
	db.session.commit()
Exemplo n.º 19
0
def grok_misses_from_files(files,
                           grok_pattern,
                           custom_patterns=None,
                           header_lines=0):
    """Return, for each file, the lines (past the first *header_lines*)
    that *grok_pattern* fails to match.

    FIX: the ``custom_patterns`` default was a mutable ``{}`` shared across
    calls; replaced with a ``None`` sentinel.

    :param files: iterable of file paths/objects understood by filter_file_with_pred
    :param grok_pattern: grok expression lines must match to NOT be reported
    :param custom_patterns: optional extra grok pattern definitions
    :param header_lines: number of leading lines to skip per file
    :return: list with one filter result per input file
    """
    grok = Grok(grok_pattern,
                custom_patterns=custom_patterns if custom_patterns is not None else {})
    pred = all_pass(
        [skip_lines(header_lines), lambda x: grok.match(x[1]) is None])
    return [filter_file_with_pred(each_file, pred) for each_file in files]
Exemplo n.º 20
0
def test_custom_pats():
    """A custom 'ID' pattern combined with builtins captures all fields."""
    custom_pats = {"ID": "%{WORD}-%{INT}"}
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = "%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    m = Grok(pat, custom_patterns=custom_pats).match(text)
    msg = "grok match failed:%s, %s" % (text, pat, )
    assert m["user_id"] == "Beijing-1104", msg
    assert m["name"] == "gary", msg
    assert m["age"] == "25", msg
    assert m["motto"] == '"never quit"', msg
Exemplo n.º 21
0
def test_hotloading_pats():
    """set_search_pattern should hot-swap the pattern on an existing Grok."""
    word_text, word_pat = 'github', '%{WORD:test_word}'
    grok = Grok(word_pat)
    result = grok.match(word_text)
    assert result['test_word'] == 'github', \
        'grok match failed:%s, %s' % (word_text, word_pat, )
    # matches

    # swap to a typed NUMBER pattern on the same instance
    year_text, year_pat = '1989', '%{NUMBER:birthyear:int}'
    grok.set_search_pattern(year_pat)
    result = grok.match(year_text)
    assert result == {'birthyear': 1989}, \
        'grok match failed:%s, %s' % (year_text, year_pat, )
Exemplo n.º 22
0
 def insert_log_data(self, data):
     """For each log line containing one of the configured keep-phrases,
     grok-parse the line, persist it via insert_data, and post-process the
     stored document with format_single_doc.

     FIX: the (constant) grok pattern was previously recompiled for every
     matching line; it is now compiled once up front.

     :param data: iterable of raw log lines
     """
     # pattern is constant across lines -> compile the Grok once
     pattern = '%{TIMESTAMP_ISO8601:timestamp}%{SPACE}(\[%{WORD:pid}%{SPACE}%{POSINT:pid}])%{SPACE}(\[%{NUMBER:responsetime}?ms])%{SPACE}(\[%{WORD:uid}\s+%{WORD:uidname}])%{SPACE}(\[%{LOGLEVEL:loglevel}])%{SPACE}(%{URIPATHPARAM:request})%{SPACE}%{GREEDYDATA:syslog_message}'
     grok = Grok(pattern)
     print("Line ------ : {}".format(data))
     for line in data:
         for phrase in self.keep_phrases:
             if phrase in line:
                 grok_json = grok.match(line)
                 post_id = self.insert_data(grok_json)
                 print("Grok json : {}".format(grok_json))
                 print(post_id)
                 self.format_single_doc(post_id, grok_json)
Exemplo n.º 23
0
def get_saml_response(driver, debug=False, sleeptime=0.5):
    """Poll the browser performance log until an AWS SAMLResponse appears,
    then return it URL-decoded.

    Blocks forever (sleeping *sleeptime* between polls) until a log entry
    from the AWS sign-in SAML endpoint is seen.

    :param driver: selenium-style driver exposing get_log('performance')
    :param debug: when True, print the matching raw log entry
    :param sleeptime: seconds to sleep between polls
    :return: the URL-decoded SAMLResponse string
    """
    # Grok for parsing xml should redo with bsoup4
    while True:
        time.sleep(sleeptime)
        for entry in driver.get_log('performance'):
            entry_text = str(entry)
            lowered = entry_text.lower()
            if 'samlresponse' not in lowered:
                continue
            if '"documenturl":"https://signin.aws.amazon.com/saml"' not in lowered:
                continue
            if debug == True:
                print("Saml matches: " + entry_text)
            from pygrok import Grok
            pattern = '%{GREEDYDATA}SAMLResponse=%{DATA:samlresponse}&%{GREEDYDATA}"%{GREEDYDATA}'
            encoded = Grok(pattern).match(entry_text)['samlresponse']
            return unquote(encoded)
Exemplo n.º 24
0
    def from_file_path(cls, path):
        """Build an instance by scanning *path* for Terraform resource headers.

        Each line matching ``resource "<type>" "<name>" {`` contributes one
        TFResource; an instance is returned even when no resources are found.

        FIX: removed the redundant ``if len(tf_resources) > 0`` branch,
        which returned exactly the same value as the fallthrough below it.

        :param path: path of the Terraform file to scan
        :return: cls wrapping the list of discovered TFResource objects
        """
        pattern = '^resource "%{WORD:resource_type}" "%{DATA:resource_name}" {'
        grok = Grok(pattern)
        with open(path, "r") as f:
            lines = f.readlines()

        tf_resources = []
        for line in lines:
            res = grok.match(line)
            if res is not None and "resource_type" in res and "resource_name" in res:
                tf_resources.append(TFResource(**res))
        return cls(tf_resources)
Exemplo n.º 25
0
def test_custom_pat_files():
    """Patterns from custom_patterns_dir resolve alongside builtins."""
    import os.path

    # pattern "ID" is defined in ./test_patterns/pats
    pats_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "test_patterns")
    text = 'Beijing-1104,gary 25 "never quit"'
    pat = "%{ID:user_id},%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}"
    grok = Grok(pat, custom_patterns_dir=pats_dir)
    m = grok.match(text)
    failure_msg = "grok match failed:%s, %s" % (text, pat, )
    assert m["user_id"] == "Beijing-1104", failure_msg
    assert m["name"] == "gary", failure_msg
    assert m["age"] == "25", failure_msg
    assert m["motto"] == '"never quit"', failure_msg
Exemplo n.º 26
0
def test_exim_received():
    """Test exim incoming email"""
    log_entry = ('2017-06-24 15:41:05 1dOqvZ-0002HB-Ob <= '
                 '[email protected] H=in3f.electric.net '
                 '[72.35.12.46]:47218 P=esmtps X=TLSv1.2:ECDHE-RSA-AES256-GCM-SHA384:256 '
                 'CV=no S=6297 T="Millionaires, CEOs, Entrepreneurs are using brain pills '
                 'to boost intelligence." for [email protected]')
    patterns_directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok(
        '%{SYSLOG_EXIM}',
        custom_patterns_dir=patterns_directory).match(log_entry)
    expected = {
        'exim_timestamp': '2017-06-24 15:41:05',
        'exim_log_id': '1dOqvZ-0002HB-Ob',
        'exim_remote_host': 'in3f.electric.net',
        'exim_remote_ip': '72.35.12.46',
        'exim_tls_cipher': 'TLSv1.2:ECDHE-RSA-AES256-GCM-SHA384:256',
        'exim_tls_certificate_verified': 'no',
        'exim_sender_address': '*****@*****.**',
        'exim_port': '47218',
        'exim_protocol': 'esmtps',
        'exim_message_size': '6297',
        'exim_subject': ('Millionaires, CEOs, Entrepreneurs '
                         'are using brain pills to boost intelligence.'),
    }
    for field, value in expected.items():
        assert grok_output[field] == value
Exemplo n.º 27
0
def extractAndStripDateFromStory(df):
    """For each row of df['story'], extract a 'Month Year' date, record the
    month in a new 'Month' column, and strip the date text (plus trailing
    period) from the story text. Mutates *df* in place.

    FIXES: the Grok object was compiled once per row (now hoisted), the
    extracted date was passed unescaped to re.sub so its '.' matched any
    character (now re.escape'd), and a pointless ``''.join(...)`` over the
    date string was removed.

    :param df: DataFrame with a 'story' column; every story must contain a
        '<Month> <Year>' date or grok.match returns None and this raises --
        NOTE(review): same behavior as the original, confirm inputs.
    """
    date_pattern = '%{MONTH:month} %{YEAR:year}'
    grok = Grok(date_pattern)
    months = []
    stripped_stories = []

    for story_text in list(df['story']):
        parts = grok.match(story_text)
        date_text = parts['month'] + " " + parts['year'] + "."
        months.append(parts['month'])
        # remove the literal date text from the story
        stripped_stories.append(re.sub(re.escape(date_text), '', story_text))

    df['story'] = stripped_stories
    df['Month'] = months
Exemplo n.º 28
0
    def extract_derived_fields(self, doc_fields):
        """Extract derived fields from a document using the grok patterns
        configured under the 'derivedfields' section.

        Compiled Grok objects are cached in self.grok_filters, keyed by
        pattern, so each pattern is compiled at most once.

        :param doc_fields: document information used to extract derived fields
        :return: dict of all derived fields
        """
        derived_fields = dict()
        for field_name, grok_pattern in self.settings.config.items("derivedfields"):
            if not helpers.utils.dict_contains_dotkey(doc_fields, field_name, case_sensitive=False):
                continue

            if grok_pattern not in self.grok_filters:
                self.grok_filters[grok_pattern] = Grok(grok_pattern)
            grok = self.grok_filters[grok_pattern]

            match_dict = grok.match(helpers.utils.get_dotkey_value(doc_fields, field_name, case_sensitive=False))

            if match_dict:
                derived_fields.update(match_dict)

        return derived_fields
Exemplo n.º 29
0
def test_exim_smtp_outgoing():
    """Test exim outgoing smtp"""
    log_entry = ('2017-06-24 23:36:21 1dOyLV-000359-20 SMTP connection outbound '
                 '1498361781 1dOyLV-000359-20 domain.com [email protected]')
    patterns_directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok(
        '%{SYSLOG_EXIM}',
        custom_patterns_dir=patterns_directory).match(log_entry)
    expected = {
        'exim_log_id': '1dOyLV-000359-20',
        'exim_sender': 'domain.com',
        'exim_external_recipient': '*****@*****.**',
        'exim_timestamp': '2017-06-24 23:36:21',
    }
    for field, value in expected.items():
        assert grok_output[field] == value
Exemplo n.º 30
0
def test_arr_pats():
    """':arr' captures should yield lists of sub-match dicts plus a '_str'
    key holding the raw matched text."""
    custom_pats = {
        "POWER": "(?:%{NUMBER}|(-inf))",
        "HEX": "(?:(0[x,X])*(?:[0-9a-zA-Z]+))",
        "HEXENUM": "%{HEX:v}[ ]?",
        "HEXLINE": "%{HEXENUM:col:arr}\n?"
    }

    text = '0x1 0x2 0x3'
    pat = '%{HEXLINE:row}'
    grok = Grok(pat, custom_patterns=custom_pats, fullmatch=False)
    m = grok.match(text)
    err = 'grok match failed:%s, %s' % (text, pat, )
    assert m['row'] == '0x1 0x2 0x3', err
    assert m['row_col']['_str'] == '0x1 0x2 0x3', err
    assert m['row_col'][0]["v"] == '0x1', err
    assert m['row_col'][1]["v"] == '0x2', err
    assert m['row_col'][2]["v"] == '0x3', err
    # matches

    # multi-row input: nested :arr on :arr
    text = '0x1 0x2 0x3\n0x4 0x5 0x6\n0x7 0x8 0x9'
    pat = '%{HEXLINE:row:arr}'
    grok.set_search_pattern(pat)
    m = grok.match(text)
    err = 'grok match failed:%s, %s' % (text, pat, )
    # NOTE(review): only the first two of the three rows are checked,
    # mirroring the original test.
    for i in range(2):
        for j in range(3):
            assert m['row'][i]['col'][j]['v'] == "0x%d" % (i * 3 + j + 1), err
Exemplo n.º 31
0
def inputa():
    """Replay an access-log file of JSON lines as documents POSTed to a
    local ingest endpoint.

    Each line is expected to be a JSON object with a nested 'fields' object
    (flattened into the document) and a 'request' string that the grok
    pattern splits into method/url/version.

    FIXES: the log file is now opened with a context manager (handle was
    leaked), the bare ``except`` around json.loads is narrowed to
    ValueError, and dead code was removed (the no-op
    ``_l['time'] = _l.pop('time')``, the unused ``remote_addr``/``body``
    locals, and the commented-out geoip block).
    """
    req_pattern = "%{WORD:method} %{URIPATHPARAM:url} %{WORD:version}"
    grok = Grok(req_pattern)
    f_n = 'access.log-2017-10-17-1508216401.log'
    http = urllib3.PoolManager()
    with open(f_n, "r") as f:
        for line in f:
            try:
                doc = json.loads(line)
            except ValueError:
                continue  # skip lines that are not valid JSON
            doc.update(doc.pop('fields'))
            req_t = doc.pop("request")
            doc.update(grok.match(req_t))
            print(doc)
            http.request('post',
                         '127.0.0.1:8885/master/data/input/',
                         headers={"Content-Type": "application/json"},
                         body=json.dumps(doc))
Exemplo n.º 32
0
def test_exim_failure():
    """Test exim message failure"""
    log_entry = ('2017-06-18 04:03:59 1dMVBf-0007Wx-F5 ** '
                 '[email protected] R=virtual_aliases: No Such User Here')
    patterns_directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok(
        '%{SYSLOG_EXIM}',
        custom_patterns_dir=patterns_directory).match(log_entry)
    expected = {
        'exim_timestamp': '2017-06-18 04:03:59',
        'exim_log_id': '1dMVBf-0007Wx-F5',
        'exim_final_delivery_address': '*****@*****.**',
        'exim_router': 'virtual_aliases',
        'exim_failure_message': 'No Such User Here',
    }
    for field, value in expected.items():
        assert grok_output[field] == value
Exemplo n.º 33
0
def test_exim_smtp():
    """Test exim incoming smtp delivery."""
    log_entry = ('2017-06-24 15:31:43 SMTP connection from mail.example.com '
                 '[192.168.1.1]:46890 closed by QUIT')
    patterns_directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '../patterns.d')
    grok_output = Grok(
        '%{SYSLOG_EXIM}',
        custom_patterns_dir=patterns_directory).match(log_entry)
    expected = {
        'exim_port': '46890',
        'exim_url': 'mail.example.com',
        'exim_ip_address': '192.168.1.1',
        'exim_smtp_message': 'closed by QUIT',
        'exim_timestamp': '2017-06-24 15:31:43',
    }
    for field, value in expected.items():
        assert grok_output[field] == value
Exemplo n.º 34
0
def test_multiple_pats():
    """Match grok patterns that combine several named captures in one line.

    Covers: a plain multi-field match, unnamed captures (dropped from the
    result), non-matching input (returns ``None``), a realistic nginx
    access-log line, and ``:int``/``:float`` type-hinted captures.
    """
    text = 'gary 25 "never quit"'
    pat = '%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['name'] == 'gary' and m['age'] == '25' and m['motto'] == '"never quit"', \
        'grok match failed:%s, %s' % (text, pat, )

    # Captures without a variable name are discarded: the line matches,
    # but the result dict is empty.
    text = 'gary 25 "never quit"'
    pat = '%{WORD} %{INT} %{QUOTEDSTRING}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {}, 'grok match failed:%s, %s' % (text, pat, )

    # "male" is not an INT, so the whole pattern fails and match() returns None.
    text = 'gary male "never quit"'
    pat = '%{WORD:name} %{INT:age} %{QUOTEDSTRING:motto}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m is None, 'grok match failed:%s, %s' % (text, pat, )

    # nginx access-log line.
    text = 'edge.v.iask.com.edge.sinastorage.com 14.18.243.65 6.032s - [21/Jul/2014:16:00:02 +0800]' \
        + ' "GET /edge.v.iask.com/125880034.hlv HTTP/1.0" 200 70528990 "-"' \
        + ' "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)' \
        + ' Chrome/36.0.1985.125 Safari/537.36"'
    # The first segment is a raw string because of the literal \[ and \]:
    # in a plain string those are invalid escape sequences (SyntaxWarning
    # since Python 3.6, slated to become a SyntaxError). The runtime value
    # is unchanged.
    pat = r'%{HOSTNAME:host} %{IP:client_ip} %{NUMBER:delay}s - \[%{DATA:time_stamp}\]' \
        + ' "%{WORD:verb} %{URIPATHPARAM:uri_path} HTTP/%{NUMBER:http_ver}" %{INT:http_status} %{INT:bytes} %{QS}' \
        + ' %{QS:client}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m['host'] == 'edge.v.iask.com.edge.sinastorage.com' and m['client_ip'] == '14.18.243.65' \
        and m['delay'] == '6.032' and m['time_stamp'] == '21/Jul/2014:16:00:02 +0800' and m['verb'] == 'GET' \
        and m['uri_path'] == '/edge.v.iask.com/125880034.hlv' and m['http_ver'] == '1.0' \
        and m['http_status'] == '200' and m['bytes'] == '70528990' \
        and m['client'] == '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)' \
        + ' Chrome/36.0.1985.125 Safari/537.36"', 'grok match failed:%s, %s' % (text, pat, )

    # A ":int" type hint converts the captured strings to integers.
    text = '1989/02/23'
    pat = '%{NUMBER:birthyear:int}/%{NUMBER:birthmonth:int}/%{NUMBER:birthday:int}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {'birthyear': 1989, 'birthmonth': 2, 'birthday': 23}, 'grok match failed:%s, %s' % (text, pat, )

    # A ":float" type hint converts the captured strings to floats.
    text = 'load average: 1.88, 1.73, 1.49'
    pat = 'load average: %{NUMBER:load_1:float}, %{NUMBER:load_2:float}, %{NUMBER:load_3:float}'
    grok = Grok(pat)
    m = grok.match(text)
    assert m == {'load_1': 1.88, 'load_2': 1.73, 'load_3': 1.49}, 'grok match failed:%s, %s' % (text, pat, )
Exemplo n.º 35
0
def test_one_pat():
    """Match single-capture grok patterns against simple inputs.

    Also checks that an unnamed capture yields an empty dict and that a
    non-matching pattern yields ``None``.
    """
    # (input text, pattern, result key, expected captured value)
    simple_cases = [
        ('1024', '%{INT:test_int}', 'test_int', '1024'),
        ('1024', '%{NUMBER:test_num}', 'test_num', '1024'),
        ('garyelephant ', '%{WORD:name} ', 'name', 'garyelephant'),
        ('192.168.1.1', '%{IP:ip}', 'ip', '192.168.1.1'),
        ('github.com', '%{HOSTNAME:website}', 'website', 'github.com'),
        ('1989-11-04 05:33:02+0800', '%{TIMESTAMP_ISO8601:ts}', 'ts',
         '1989-11-04 05:33:02+0800'),
    ]
    for text, pat, key, expected in simple_cases:
        m = Grok(pat).match(text)
        assert m[key] == expected, 'grok match failed:%s, %s' % (text, pat, )

    # Without a variable name the capture is dropped: the line matches but
    # the result dict is empty (compare "%{WORD}" with "%{WORD:variable_name}").
    text = 'github'
    pat = '%{WORD}'
    m = Grok(pat).match(text)
    assert m == {}, 'grok match failed:%s, %s' % (text, pat, )

    # "github" is not a NUMBER, so match() returns None.
    text = 'github'
    pat = '%{NUMBER:test_num}'
    m = Grok(pat).match(text)
    assert m is None, 'grok match failed:%s, %s' % (text, pat, )

    # A ":int" type hint converts the captured string to an integer.
    text = '1989'
    pat = '%{NUMBER:birthyear:int}'
    m = Grok(pat).match(text)
    assert m == {'birthyear': 1989}, 'grok match failed:%s, %s' % (text, pat, )