Example #1
    def test_regular_expression(self):
        validator = validators.RegularExpression()

        # duck-type: act like it's a regex and allow failure if it isn't one
        validator.__call__('a').match('a')

        self.assertEqual(validator.__call__(None), None)
        self.assertRaises(ValueError, validator.__call__, '(a')
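
The behavior exercised by the test above can be reproduced by calling the validator directly; a minimal standalone sketch (assuming splunklib is installed):

from splunklib.searchcommands import validators

validator = validators.RegularExpression()
pattern = validator(r'\d+')         # a valid pattern string comes back compiled
assert pattern.match('123') is not None
assert validator(None) is None      # None passes through unchanged
try:
    validator('(a')                 # an unbalanced pattern raises ValueError
except ValueError:
    pass
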
class StubbedReportingCommand(ReportingCommand):
    boolean = Option(
        doc='''
        **Syntax:** **boolean=***<value>*
        **Description:** A boolean value''',
        require=False, validate=validators.Boolean())

    duration = Option(
        doc='''
        **Syntax:** **duration=***<value>*
        **Description:** A length of time''',
        validate=validators.Duration())

    fieldname = Option(
        doc='''
        **Syntax:** **fieldname=***<value>*
        **Description:** Name of a field''',
        validate=validators.Fieldname())

    file = Option(
        doc='''
        **Syntax:** **file=***<value>*
        **Description:** Name of a file''',
        validate=validators.File(mode='r'))

    integer = Option(
        doc='''
        **Syntax:** **integer=***<value>*
        **Description:** An integer value''',
        validate=validators.Integer())

    optionname = Option(
        doc='''
        **Syntax:** **optionname=***<value>*
        **Description:** The name of an option (used internally)''',
        validate=validators.OptionName())

    regularexpression = Option(
        doc='''
        **Syntax:** **regularexpression=***<value>*
        **Description:** Regular expression pattern to match''',
        validate=validators.RegularExpression())

    set = Option(
        doc='''
        **Syntax:** **set=***<value>*
        **Description:** A member of a set''',
        validate=validators.Set("foo", "bar", "test"))

    @Configuration()
    def map(self, records):
        pass

    def reduce(self, records):
        pass
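
A command class like StubbedReportingCommand above is normally deployed with a module-level dispatch call that hands control to the SDK's protocol handler; that boilerplate is not part of the snippet, but a minimal sketch looks like this:

import sys
from splunklib.searchcommands import dispatch

if __name__ == '__main__':
    # dispatch() parses argv, validates the declared Options, and runs the command
    dispatch(StubbedReportingCommand, sys.argv, sys.stdin, sys.stdout, __name__)
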
Example #3
class CountMatchesCommand(StreamingCommand):
    """ Counts the number of non-overlapping matches to a regular expression in
    a set of fields.

    ##Syntax

    .. code-block::
        countmatches fieldname=<field> pattern=<regular_expression> <field-list>

    ##Description

    A count of the number of non-overlapping matches to the regular expression
    specified by `pattern` is computed for each record processed. The result
    is stored in the field specified by `fieldname`. If `fieldname` exists,
    its value is replaced. If `fieldname` does not exist, it is created.
    Event records are otherwise passed through to the next pipeline processor
    unmodified.

    ##Example

    Count the number of words in the `text` of each tweet in tweets.csv and
    store the result in `word_count`.

    .. code-block::
        | inputcsv tweets.csv | countmatches fieldname=word_count
        pattern="\\w+" text

    """
    fieldname = Option(doc='''
        **Syntax:** **fieldname=***<fieldname>*
        **Description:** Name of the field that will hold the match count''',
                       require=True,
                       validate=validators.Fieldname())

    pattern = Option(doc='''
        **Syntax:** **pattern=***<regular-expression>*
        **Description:** Regular expression pattern to match''',
                     require=True,
                     validate=validators.RegularExpression())

    def stream(self, records):
        self.logger.debug('CountMatchesCommand: %s', self)  # logs command line
        for record in records:
            count = 0.0
            for fieldname in self.fieldnames:
                matches = self.pattern.finditer(str(record[fieldname]))
                count += len(list(matches))
            record[self.fieldname] = count
            yield record
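
The per-record counting that stream() performs can be reproduced without any Splunk plumbing; a small standalone sketch of the same logic (field names here are illustrative):

import re

pattern = re.compile(r'\w+')
record = {'text': 'three little words'}
count = sum(1 for _ in pattern.finditer(str(record['text'])))
record['word_count'] = float(count)   # the command stores the count as a float
print(record)                         # {'text': 'three little words', 'word_count': 3.0}
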
Example #4
class SearchTableCommand(StreamingCommand):
    pattern = Option(doc='''
        **Syntax:** **pattern=***<regular-expression>*
        **Description:** Regular expression pattern to match''',
                     require=False,
                     validate=validators.RegularExpression())

    def stream(self, records):
        #pydevd.settrace()
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug('SearchTableCommand: %s' % self)  # logs command line
        for record in records:
            found = "false"
            for field in record:
                matches = len(list(self.pattern.finditer(str(record[field]))))
                if matches > 0:
                    found = "true"
            if found == "true":
                yield record
        self.logger.debug('SearchTableCommand: Done')
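
SearchTableCommand acts as a record filter: a record is passed along only if at least one of its fields matches the pattern. The core test, stripped of the command scaffolding (sample data is made up):

import re

pattern = re.compile(r'error')
records = [{'msg': 'all good', 'host': 'a'}, {'msg': 'disk error', 'host': 'b'}]
kept = [r for r in records
        if any(len(list(pattern.finditer(str(r[field])))) > 0 for field in r)]
print(kept)   # only the record containing 'error' survives
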
Example #5
class TestSearchCommand(SearchCommand):

    boolean = Option(
        doc='''
        **Syntax:** **boolean=***<value>*
        **Description:** A boolean value''',
        validate=validators.Boolean())

    required_boolean = Option(
        doc='''
        **Syntax:** **boolean=***<value>*
        **Description:** A boolean value''',
        require=True, validate=validators.Boolean())

    aliased_required_boolean = Option(
        doc='''
        **Syntax:** **boolean=***<value>*
        **Description:** A boolean value''',
        name='foo', require=True, validate=validators.Boolean())

    code = Option(
        doc='''
        **Syntax:** **code=***<value>*
        **Description:** A Python expression, if mode == "eval", or statement, if mode == "exec"''',
        validate=validators.Code())

    required_code = Option(
        doc='''
        **Syntax:** **code=***<value>*
        **Description:** A Python expression, if mode == "eval", or statement, if mode == "exec"''',
        require=True, validate=validators.Code())

    duration = Option(
        doc='''
        **Syntax:** **duration=***<value>*
        **Description:** A length of time''',
        validate=validators.Duration())

    required_duration = Option(
        doc='''
        **Syntax:** **duration=***<value>*
        **Description:** A length of time''',
        require=True, validate=validators.Duration())

    fieldname = Option(
        doc='''
        **Syntax:** **fieldname=***<value>*
        **Description:** Name of a field''',
        validate=validators.Fieldname())

    required_fieldname = Option(
        doc='''
        **Syntax:** **fieldname=***<value>*
        **Description:** Name of a field''',
        require=True, validate=validators.Fieldname())

    file = Option(
        doc='''
        **Syntax:** **file=***<value>*
        **Description:** Name of a file''',
        validate=validators.File())

    required_file = Option(
        doc='''
        **Syntax:** **file=***<value>*
        **Description:** Name of a file''',
        require=True, validate=validators.File())

    integer = Option(
        doc='''
        **Syntax:** **integer=***<value>*
        **Description:** An integer value''',
        validate=validators.Integer())

    required_integer = Option(
        doc='''
        **Syntax:** **integer=***<value>*
        **Description:** An integer value''',
        require=True, validate=validators.Integer())

    map = Option(
        doc='''
        **Syntax:** **map=***<value>*
        **Description:** A mapping from one value to another''',
        validate=validators.Map(foo=1, bar=2, test=3))

    required_map = Option(
        doc='''
        **Syntax:** **map=***<value>*
        **Description:** A mapping from one value to another''',
        require=True, validate=validators.Map(foo=1, bar=2, test=3))

    match = Option(
        doc='''
        **Syntax:** **match=***<value>*
        **Description:** A value that matches a regular expression pattern''',
        validate=validators.Match('social security number', r'\d{3}-\d{2}-\d{4}'))

    required_match = Option(
        doc='''
        **Syntax:** **required_match=***<value>*
        **Description:** A value that matches a regular expression pattern''',
        require=True, validate=validators.Match('social security number', r'\d{3}-\d{2}-\d{4}'))

    optionname = Option(
        doc='''
        **Syntax:** **optionname=***<value>*
        **Description:** The name of an option (used internally)''',
        validate=validators.OptionName())

    required_optionname = Option(
        doc='''
        **Syntax:** **optionname=***<value>*
        **Description:** The name of an option (used internally)''',
        require=True, validate=validators.OptionName())

    regularexpression = Option(
        doc='''
        **Syntax:** **regularexpression=***<value>*
        **Description:** Regular expression pattern to match''',
        validate=validators.RegularExpression())

    required_regularexpression = Option(
        doc='''
        **Syntax:** **regularexpression=***<value>*
        **Description:** Regular expression pattern to match''',
        require=True, validate=validators.RegularExpression())

    set = Option(
        doc='''
        **Syntax:** **set=***<value>*
        **Description:** A member of a set''',
        validate=validators.Set('foo', 'bar', 'test'))

    required_set = Option(
        doc='''
        **Syntax:** **set=***<value>*
        **Description:** A member of a set''',
        require=True, validate=validators.Set('foo', 'bar', 'test'))

    class ConfigurationSettings(SearchCommand.ConfigurationSettings):
        @classmethod
        def fix_up(cls, command_class):
            pass
    def test_regular_expression(self):

        validator = validators.RegularExpression()
        self.assertIsInstance(validator.__call__('a'), re._pattern_type)
        self.assertEqual(validator.__call__(None), None)
        self.assertRaises(ValueError, validator.__call__, '(a')
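
Note that the assertion above relies on re._pattern_type, which only exists on older Pythons; a version-tolerant equivalent (a sketch, not part of the original test) could look like this:

import re

# re.Pattern is available on newer Pythons; otherwise fall back to the type of a compiled pattern.
PatternType = getattr(re, 'Pattern', type(re.compile('')))
assert isinstance(re.compile('a'), PatternType)
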
Example #7
class CountMatchesCommand(StreamingCommand):
    """ Counts the number of non-overlapping matches to a regular expression in a set of fields.

    ##Syntax

    .. code-block::
        countmatches fieldname=<field> pattern=<regular_expression> <field-list>

    ##Description

    A count of the number of non-overlapping matches to the regular expression specified by `pattern` is computed for
    each record processed. The result is stored in the field specified by `fieldname`. If `fieldname` exists, its value
    is replaced. If `fieldname` does not exist, it is created. Event records are otherwise passed through to the next
    pipeline processor unmodified.

    ##Example

    Count the number of words in the `text` of each tweet in tweets.csv and store the result in `word_count`.

    .. code-block::
        | inputlookup tweets | countmatches fieldname=word_count pattern="\\w+" text

    """
    fieldname = Option(
        doc='''
        **Syntax:** **fieldname=***<fieldname>*
        **Description:** Name of the field that will hold the match count''',
        require=True, validate=validators.Fieldname())

    outname = Option(
        doc='''
        **Syntax:** **outname=***<outname>*
        **Description:** Name of the output field that will hold the index name''',
        require=True, validate=validators.Fieldname())

    pattern = Option(
        doc='''
        **Syntax:** **pattern=***<regular-expression>*
        **Description:** Regular expression pattern to match''',
        require=True, validate=validators.RegularExpression())

    def stream(self, records):
        self.logger.debug('CountMatchesCommand: %s', self)  # logs command line
        pattern = self.pattern
        outname = self.outname

        count = 0
        whitelist = ""

        for record in records:

            for fieldname in self.fieldnames:
                matches = pattern.findall(six.text_type(record[fieldname]))
                count += len(matches)
            record[self.fieldname] = count

            if whitelist != "":
                whitelist = str(whitelist) + "|" + str(record)
            else:
                whitelist = str(record)

        # whitelist is empty
        if count == 0:
            whitelist = "[('" + str(outname) + "', '*')]"

        yield {'_raw': str(whitelist)}
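
This variant accumulates one match count across all records and emits a single _raw event containing a pipe-joined string of the records it saw, falling back to a wildcard placeholder when no matches were counted. A standalone sketch of that accumulation (record contents and the field name are illustrative):

records = [{'index': 'main'}, {'index': 'ops'}]
count = 0        # stand-in for the accumulated match count
whitelist = ''
for record in records:
    whitelist = whitelist + '|' + str(record) if whitelist else str(record)
if count == 0:
    whitelist = "[('index', '*')]"   # wildcard placeholder when nothing matched
print(whitelist)
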
Example #8
class fuzzylookup(StreamingCommand):
    doc = '''
	**Syntax:**
	| fuzzylookup 
		[ prefix=<string> ]
		[ lookupfilter=<kvpairs> ]
		[ mask=<regex> ]
		[ delete=<regex> ]
		<lookup-table-name> 
		( <lookup-field> [AS <event-field>] ) 
		[ OUTPUT | OUTPUTNEW (<lookup-destfield> [AS <event-destfield>] ) ... ]

	**Description**
	Takes a field from search results and compares it to a lookup for near-matches
	'''

    prefix = Option(doc='''
		**Syntax:** **prefix=***"prefix_text"
		**Description:** Text to prefix all output field names with. Helpful if you just want every lookup field without aliasing each one.''',
                    require=False)

    addmetrics = Option(doc='''
		**Syntax:** **addmetrics=***[True|False]
		**Description:** Add fuzzy match metrics to each result (score, matching characters, similarity score, consecutive match length)
		**Default:** False''',
                        require=False,
                        validate=validators.Boolean())

    lookupfilter = Option(doc='''
		**Syntax:** **lookupfilter=***"LookupField1=\"local admin\" Lookupfield2=\"*@$email_domain$\""* (wildcard, variable, or literal string match)
		**Description:** Filter for data in the specified lookup to narrow down comparisons''',
                          require=False)

    mask = Option(doc='''
		**Syntax:** **mask=***"regular expression"*
		**Description:** Mask pattern for both compared sets of values. Masks the regex matched text before comparing.''',
                  require=False,
                  validate=validators.RegularExpression())

    delete = Option(doc='''
		**Syntax:** **delete=***"regular expression"*
		**Description:** Deletion pattern for both compared sets of values. Removes the regex matched text before comparing.''',
                    require=False,
                    validate=validators.RegularExpression())

    session_key = ''
    splunkd_uri = ''
    service = ''
    lookup_list = []
    lookup_filters_static = []
    lookup_filters_dynamic = []

    lookup = ''
    lookupfield = ''
    searchfield = ''
    output_aliases = OrderedDict()
    # Default output field overwrite setting is True
    output_overwrite = True

    # Store the data from the lookup for each dynamic filter
    # Use a manager as a proxy to allow for cross-process communication
    manager = Manager()
    prepopulated_filter_lookupdata = manager.dict()
    l = manager.list()

    # Define main function
    def stream(self, events):
        logger = setup_logging('fuzzylookup')

        args = [
            val for val in self._metadata.searchinfo.args[2:] if '=' not in val
        ]

        logger.debug("Arguments: " + str(self._metadata.searchinfo.args[2:]))
        arg_count = len(args)
        arg_index = 0

        # Parse the arguments to the command
        if arg_count >= 3:
            while arg_index < arg_count:
                # Process the lookup name, lookup field, search field
                if self.lookup == '':
                    self.lookup = args[arg_index]
                    arg_index += 1
                if self.lookupfield == '':
                    self.lookupfield = args[arg_index]
                    if len(args) >= arg_index + 2:
                        if args[arg_index + 1].upper() == 'AS':
                            self.searchfield = args[arg_index + 2]
                            arg_index += 3
                        else:
                            self.searchfield = self.lookupfield
                            arg_index += 1
                    else:
                        self.searchfield = self.lookupfield
                        arg_index += 1

                if arg_index < len(args) and None not in [
                        self.lookup, self.lookupfield, self.searchfield
                ]:
                    if args[arg_index].upper() == 'OUTPUT':
                        self.output_overwrite = True
                    elif args[arg_index].upper() == 'OUTPUTNEW':
                        self.output_overwrite = False
                    else:
                        # Add field to output fields list
                        output_field_name = args[arg_index].strip(',')
                        if len(args) >= arg_index + 2:
                            if args[arg_index + 1].upper() == 'AS':
                                self.output_aliases[output_field_name] = args[
                                    arg_index + 2]
                                arg_index += 2
                            else:
                                self.output_aliases[
                                    output_field_name] = output_field_name
                        else:
                            self.output_aliases[
                                output_field_name] = output_field_name
                    arg_index += 1
        else:
            logger.critical(
                "Not enough parameters specified to execute fuzzylookup.")
            print("Not enough parameters specified to execute fuzzylookup.")
            exit(1957)

        if None in [self.lookup, self.lookupfield, self.searchfield]:
            logger.critical("Could not parse all arguments for fuzzylookup")
            print("Could not parse all arguments for fuzzylookup")
            exit(1173)

        logger.debug("lookup: " + self.lookup)
        logger.debug("lookupfield: " + self.lookupfield)
        logger.debug("searchfield: " + self.searchfield)
        logger.debug("output_overwrite: " + str(self.output_overwrite))
        logger.debug("output_aliases: " + str(self.output_aliases))

        if self.prefix is None:
            self.prefix = ''
        if self.addmetrics is None:
            self.addmetrics = False
        logger.debug("prefix = %s", self.prefix)
        logger.debug("addmetrics = %s", self.addmetrics)

        #log beginning of comparison
        logger.info('Comparing %s to %s in %s lookup for fuzzy matches',
                    self.searchfield, self.lookupfield, self.lookup)
        start_time = time.time()

        lookupfilter_str = ''
        # See if we have a lookup filter we can use in the root search
        if self.lookupfilter is not None and len(self.lookupfilter) > 0:
            # Split the filter into multiple key/value filters
            # Break the data into multiple fields, if needed
            # Replace the space delimiter with |, then split by |
            filter_list = re.sub(r'\s+(\w+=)', '|\g<1>',
                                 self.lookupfilter).split('|')  # pylint: disable=anomalous-backslash-in-string
            for f in filter_list:
                logger.debug("filter = " + f)
                filter_re = re.compile(r'^(.*?)([<>=]+)(.*)$')
                m = filter_re.match(f)
                if m is not None:
                    filter_obj = {
                        'field': m.group(1),
                        'op': m.group(2),
                        'value': m.group(3).strip('"')
                    }
                    # Find the dynamic filters, referencing $fieldname$ from the event
                    if re.search(r'\$\w+\$', f) is None:
                        self.lookup_filters_static.append(filter_obj)
                    else:
                        # Find the static filters
                        self.lookup_filters_dynamic.append(filter_obj)
                else:
                    # Only handle field/value pair filters. Ignore all others.
                    logger.info("Ignored filter: %s", f)

            # Build the static filter string to go into the SPL search
            for f in self.lookup_filters_static:
                lookupfilter_str += '{0}{1}"{2}" '.format(
                    f['field'].replace('|', ""), f['op'],
                    f['value'].replace('|', ""))

        logger.debug("Static lookup filter: %s", lookupfilter_str)

        if len(lookupfilter_str) > 0:
            lookup_search = '|inputlookup {0} where {1}="*" | search {2} | eval {1}=lower({1}) | dedup {1}'.format(
                self.lookup, self.lookupfield, lookupfilter_str)
        else:
            lookup_search = '|inputlookup {0} where {1}="*" | eval {1}=lower({1}) | dedup {1}'.format(
                self.lookup, self.lookupfield)

        logger.info('Lookup query is: %s' % (lookup_search))
        # Connect via existing session key
        self.session_key = self._metadata.searchinfo.session_key
        self.splunkd_uri = self._metadata.searchinfo.splunkd_uri
        namespace = self._metadata.searchinfo.app

        try:
            self.service = client.connect(token=self.session_key)
            logger.info('Successfully connected to %s', str(self.splunkd_uri))
        except BaseException as e:
            logger.error('Error connecting: %s', repr(e))
        # bind incoming search results for reading and extraction of search field
        # execute lookup command and bind results
        logger.info('Attempting to cache lookup of %s', self.lookup)

        # Set the URL of the Splunk endpoint
        search_url = '%s/servicesNS/nobody/%s/search/jobs' % (self.splunkd_uri,
                                                              namespace)

        # Set the headers for HTTP requests
        headers = {
            'Authorization': 'Splunk %s' % self.session_key,
            'Content-Type': 'application/x-www-form-urlencoded'
        }

        try:
            request_data = {
                "search": lookup_search,
                "exec_mode": 'oneshot',
                "count": '0',
                "rf": self.lookupfield,  # Required fields list
                "namespace": namespace,
                "output_mode": 'json'
            }
            #logger.debug('Request data: %s', str(request_data))
            logger.debug('Search URL: %s', str(search_url))
            #logger.debug('Headers: %s', str(headers))

            payload = str.encode(urllib.parse.urlencode(request_data))
            json_data, result_code = request('POST', search_url, payload,
                                             headers)

            # Write the values from the lookup to lookup_list
            self.lookup_list = json.loads(json_data)['results']

            logger.info('Retrieved %d records from lookup %s',
                        len(self.lookup_list), self.lookup)
            logger.debug('Response code: %s', result_code)
            #logger.debug('Response contents: %s', json_data)
        except BaseException as e:
            logger.error('Could not cache lookup %s: %s', self.lookup, repr(e))

        # Make a Pool of workers
        pool = ThreadPool(5)

        try:
            count = 0
            if len(self.lookup_list) > 0:
                logger.debug("Running ThreadPool")
                results = pool.map(self.get_distances, events)
                for result in results:
                    yield result
                    count += 1
            else:
                for event in events:
                    yield event
                    count += 1

        except BaseException as e:
            logger.error("Error: %s" % repr(e))
            results = {}

        duration_secs = round(time.time() - start_time)
        logger.info(
            "Completed fuzzylookup search command for %s results in %s seconds.",
            str(count), str(duration_secs))

    # Run this thread once for each event
    def get_distances(self, event):
        logger = setup_logging('fuzzylookup')
        start_time = time.time()

        # sf = search field / field from search results
        # Convert to Unicode (py3 compatible)
        event_field_value = str(event[self.searchfield].lower())
        if event_field_value is None or len(event_field_value) == 0:
            return event

        # Iterate through lookupfield results and calculate get_distances
        logger.debug('Calculating distances for %s', event_field_value)
        best_match_string = None
        active_score = 100
        active_charmatch = 0
        best_score = 100
        best_charmatch = 0
        dynamic_matches = 0
        dynamic_match_list = []
        dynamic_filters = {}
        use_cache = True

        try:
            # See if we have a dynamic lookup filter (references event field values)
            if len(self.lookup_filters_dynamic) > 0:
                # For this event, calculate the dynamic lookup filters based on the data in the event
                # Using this feature dramatically speeds up searches by limiting the number of rows compared
                dynamic_filter_keys = []
                for s in self.lookup_filters_dynamic:
                    try:
                        # Look for dynamic variables in the provided filter ($xxxxx$)
                        lookup_filter_value = s['value']
                        match_list = re.findall(r'\$[^\$]+\$',
                                                lookup_filter_value)

                        # For each match, replace instances of $xxxxxx$ with the field value from the event
                        # Supports multiple event fields
                        for group in match_list:
                            v = group.strip('$')
                            if v in list(event.keys()):
                                lookup_filter_value = lookup_filter_value.replace(
                                    group, event[v])

                        dynamic_filters[s['field']] = lookup_filter_value
                    except BaseException as e:
                        logger.error(
                            "Error building dynamic lookup filters: %s",
                            repr(e))
                    # We may have more than one filtered field per row. Account for that here.
                    dynamic_filter_keys.append(s['field'] + "=" +
                                               lookup_filter_value)

                # Generate the key string so we can recall the same lookup rows later for more events
                # This is to have a shorter list to compare against
                if len(dynamic_filter_keys) > 0:
                    #logger.debug(str(dynamic_filter_keys))
                    dynamic_filter_keys.sort()
                    dynamic_filters_key = '|'.join(dynamic_filter_keys)
                    #logger.debug("dynamic_filters_key = " + dynamic_filters_key)

                    if dynamic_filters_key in list(
                            self.prepopulated_filter_lookupdata.keys(
                            )) and use_cache:
                        logger.debug(
                            "Using prepopulated filter lookup data for " +
                            dynamic_filters_key)
                        comparison_list = self.prepopulated_filter_lookupdata[
                            dynamic_filters_key]
                        # Make sure we skip the filter comparison and go straight to Levenshtein
                        dynamic_filters = {}
                    else:
                        comparison_list = self.lookup_list
                        #logger.debug("Cached dynamic filter results: " + str(len(list(self.prepopulated_filter_lookupdata.keys()))))

                else:
                    logger.error("No dynamic filters matched for input: " +
                                 str(event))
                    return event
            else:
                # No dynamic filters found. Use the raw lookup list.
                comparison_list = self.lookup_list
                dynamic_filters_key = None

            # Find the shortest distance metric
            comparison_count = 0
            for lookup_record in comparison_list:
                comparison_count += 1

                # We have a dynamic filter so we have to grab the field referenced from the event
                # Ex: Lookupfield2=\"*@$email_domain$\""
                # s['field'] = 'Lookupfield2'
                # s['value'] = "*@$email_domain$"
                filter_matched = True
                for filter_key, filter_value in list(dynamic_filters.items()):
                    try:
                        #dynamic_filter_list.append(lookup_filter_value)
                        if filter_key in list(lookup_record.keys()):
                            # Make sure the dynamic filter field matches the dynamic filter value
                            # Prepare the text field to be compared against
                            lookup_value = lookup_record[filter_key]
                            # Use fnmatch to do a pure wildcard search between the lookup row value
                            #  and the dynamic filter text from the event
                            #logger.debug("Comparing %s to %s", lookup_value, lookup_filter_value)
                            if fnmatch.fnmatch(lookup_value, filter_value):
                                pass
                            else:
                                # If the record doesn't match, skip to the next lookup value (see below)
                                filter_matched = False
                        else:
                            logger.debug(
                                "Lookup record skipped. Missing field %s: %s",
                                filter_key, str(lookup_record))
                            filter_matched = False

                    except BaseException as e:
                        logger.error(
                            "Error checking dynamic lookup filters: %s",
                            repr(e))

                if filter_matched:
                    dynamic_matches += 1
                    # Use this match for caching lookup entries that match this dynamic filter
                    dynamic_match_list.append(lookup_record)
                else:
                    # Skip comparison
                    continue

                # Produce a list of fields to output if we were not supplied one
                if len(self.output_aliases) == 0:
                    for lookup_field in list(lookup_record.keys()):
                        self.output_aliases[lookup_field] = lookup_field

                # Get the lookup field value
                lookup_value = lookup_record[self.lookupfield]

                # Convert to Unicode (Python 3 compatible version)
                sf_compare = str(event_field_value.lower())
                lf_compare = str(lookup_value.lower())
                try:
                    # Apply the deletions and masking prior to comparisons being made
                    if self.delete is not None:
                        sf_compare = re.sub(self.delete, '', sf_compare)
                        lf_compare = re.sub(self.delete, '', lf_compare)
                    if self.mask is not None:
                        sf_compare = re.sub(self.mask, '*', sf_compare)
                        lf_compare = re.sub(self.mask, '*', lf_compare)

                    #logger.debug("Comparing %s to %s", sf_compare, lf_compare)
                    active_score = jf.levenshtein_distance(
                        sf_compare, lf_compare)
                    active_charmatch = matching_chars(sf_compare, lf_compare)

                    # Get the result with the greatest 1:1 character overlap if the scores are identical
                    if active_score < best_score or (
                            active_score == best_score
                            and active_charmatch > best_charmatch):
                        # New best score
                        best_match_string = [lookup_value]
                        best_match_lookup_record = [lookup_record]
                        best_score = active_score
                        best_charmatch = active_charmatch
                        best_lf_compare = lf_compare
                    elif active_score == best_score and active_charmatch == best_charmatch:
                        # Same best score, different entry. Append to the list.
                        best_match_string.append(lookup_value)
                        best_match_lookup_record.append(lookup_record)
                except TypeError as e:
                    logger.error("Type Error: " + repr(e))
                    raise Exception
                except BaseException as e:
                    logger.error("Error comparing %s to list entry %s: %s",
                                 event_field_value, lookup_value, repr(e))

            if best_score < 100:
                # Calculate a metric for similarity based on fuzzy score and string character overlap count
                fuzzy_weight = 75
                charmatch_weight = 25
                #sequencelen_weight = 25
                max_length = max(len(sf_compare), len(best_lf_compare))
                fuzzy_metric = round(
                    (1 - (float(best_score) / max_length)) * fuzzy_weight,
                    2)  # inverted, best=0
                charmatch_metric = round(
                    (float(best_charmatch) / max_length) * charmatch_weight, 2)
                #sequencelen_metric = round((1-(float(best_sequencelen) / max_length)) * sequencelen_weight, 2)

                # Check for the best consecutive character length match in the resulting list
                # This is done in a second step to limit the number of sequence length computations
                #if len(best_match_lookup_record) > 1:
                best_sequencelen = 0
                best_sequence_lookup_record = []
                for lookup_record in best_match_lookup_record:
                    lf_compare = lookup_record[self.lookupfield].lower()
                    # Apply the deletions and masking prior to comparisons being made (again)
                    if self.delete is not None:
                        lf_compare = re.sub(self.delete, '', lf_compare)
                    if self.mask is not None:
                        lf_compare = re.sub(self.mask, '*', lf_compare)

                    # Calculate the length of consecutive character matches
                    active_sequencelen = overlap_length(sf_compare, lf_compare)

                    if active_sequencelen > best_sequencelen:
                        # New best score
                        best_sequencelen = active_sequencelen
                        best_sequence_lookup_record = [lookup_record]

                    elif active_sequencelen == best_sequencelen:
                        # Best score tie
                        best_sequence_lookup_record.append(lookup_record)
                best_match_lookup_record = best_sequence_lookup_record

                if self.addmetrics:
                    # Output the fuzzy metrics
                    event[self.prefix + "fuzzy_matchlen"] = best_sequencelen
                    event[self.prefix + "fuzzy_score"] = best_score
                    event[self.prefix + "fuzzy_charmatch"] = best_charmatch
                    event[
                        self.prefix +
                        "fuzzy_similarity"] = fuzzy_metric + charmatch_metric  # + sequencelen_metric

                # Output the fields from the lookup entry/entries
                if len(self.output_aliases) > 0:
                    logger.debug('output_aliases length: ' +
                                 str(len(self.output_aliases)))
                    # Only write selected entries to the event. Aliases and field names are identical if no alias specified.
                    for lookup_field, lookup_field_alias in list(
                            self.output_aliases.items()):
                        #logger.debug(self.output_overwrite)
                        #logger.debug(event[lookup_field])
                        #logger.debug(lookup_record[lookup_field])
                        if (self.output_overwrite or lookup_field not in list(
                                event.keys())) and lookup_field in list(
                                    lookup_record.keys()):
                            # Loop through the "best matches" lookup entries
                            lookup_field_entries = []
                            for lookup_record in best_match_lookup_record:
                                lookup_field_entries.append(
                                    lookup_record[lookup_field])
                            event[self.prefix +
                                  lookup_field_alias] = lookup_field_entries

            # Cache the dynamic lookup list entries in case another event needs the same list
            # This dramatically speeds up processing for dynamic filters that match a large part of the lookup
            if dynamic_filters_key is not None and dynamic_match_list is not None and len(
                    list(dynamic_filters.keys())) > 0:
                self.prepopulated_filter_lookupdata[
                    dynamic_filters_key] = dynamic_match_list
                #logger.debug("prepopulated_filter_lookupdata count (child process) = " + str(len(list(self.prepopulated_filter_lookupdata.keys()))))

            duration_secs = round(time.time() - start_time)
            logger.debug(
                "Done calculating distances for %s in %s seconds. Result: %s",
                event_field_value, str(duration_secs), best_match_string)
            if dynamic_filters_key is not None:
                logger.debug("Dynamic filter matches for %s: %s",
                             dynamic_filters_key, dynamic_matches)
        except BaseException as e:
            logger.error("get_distances error: " + repr(e))
            tb = traceback.format_exc()
            logger.error(tb)
        return event
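
The ranking in get_distances() combines a Levenshtein distance (lower is better) with a 1:1 character-overlap count, then blends them into a similarity score with a 75/25 weighting. A standalone sketch of that scoring, assuming the jellyfish package is installed; matching_chars here is a simplified stand-in for the helper the command imports elsewhere:

import jellyfish as jf

def matching_chars(a, b):
    # naive positional 1:1 character overlap (illustrative stand-in)
    return sum(1 for x, y in zip(a, b) if x == y)

sf_compare, lf_compare = 'jonathan smith', 'jonathon smith'
score = jf.levenshtein_distance(sf_compare, lf_compare)     # 1
charmatch = matching_chars(sf_compare, lf_compare)          # 13
max_length = max(len(sf_compare), len(lf_compare))
fuzzy_metric = round((1 - float(score) / max_length) * 75, 2)
charmatch_metric = round((float(charmatch) / max_length) * 25, 2)
print(fuzzy_metric + charmatch_metric)                      # similarity, best is 100
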
Example #9
class SplunkRerunCommand(GeneratingCommand):

    regex = Option(require=True, validate=validators.RegularExpression(),
                    doc='''
                    **Syntax:** **regex=***<regex pattern>*
                    **Description:** regex pattern matching alerts/reports to rerun''')
    trigger = Option(validate=validators.Boolean(), default=False)

    #Todo - Update tz and epoch to be class variables
    
    
    # Apply snap to earliest or latest
    # This would be the @<snap> period of pattern
    def applySnap(self, unit, original):
        tz = tzlocal.get_localzone()
        #Get epoch as Datetime based on timezone
        epoch = datetime.datetime.fromtimestamp(0,tz=tz)
        #Get runtime as Datetime based on timezone
        orig = datetime.datetime.fromtimestamp(int(original),tz=tz)
        #Depending on Snap replace units with 0 in Datetime
        #then convert back to epoch by subtracting epoch Datetime 
        #and returning difference in seconds 
        if(unit==None):
            return original
        if(unit=="m" or "min" in unit):
            x = (orig.replace(second=0)-epoch).total_seconds()
        elif(unit=="h" or "hr" in unit or "hour" in unit):
            x = (orig.replace(minute=0,second=0)-epoch).total_seconds()
        elif(unit=="d" or  "day" in unit):
            x = (orig.replace(hour=0,minute=0,second=0)-epoch).total_seconds()
        elif("mon" in unit):
            x = (orig.replace(day=1,hour=0,minute=0,second=0)-epoch).total_seconds()
        elif(unit=="y" or "yr" in unit or "year" in unit):
            x = (orig.replace(month=1,day=1,hour=0,minute=0,second=0)-epoch).total_seconds()
        elif("w" in unit):
            day = orig.weekday()
            x = ((orig-datetime.timedelta(days=day))-epoch).total_seconds()
        else:
            raise Exception("Error parsing snap unit; no match for unit")
        return x

    # Apply the offset to earliest and latest
    # This can be in 3 places in the pattern
    # <offset 1><offset 2>@<snap><snap offset>
    # -15m+5s@d+1h
    # the function handles one offset at a time; it does not matter which offset it is
    def applyOffset(self, offset, unit, original):
        # No need to convert to datetime in this function (exception month)
        # Just add or subtract the appropriate number of seconds
        if(offset==None or unit==None):
            return original
        if(unit=="s" or "sec" in unit):
            x = original + int(offset)
        elif(unit=="m" or "min" in unit):
            x = original + int(offset)*60
        elif(unit=="h" or "hr" in unit or "hour" in unit):
            x = original + int(offset)*60*60
        elif(unit=="d" or "day" in unit):
            x = original + int(offset)*60*60*24
        elif("w" in unit):
            x = original + int(offset)*60*60*24*7
        elif(unit=="y" or "yr" in unit or "year" in unit):
            x = original + int(offset)*60*60*24*365
        # Month is a special use case, since it is the only period that does not have a set number of seconds
        # To handle Month I convert to Datetime, and use relativedelta to add or subtract the number of months
        # then subtract epoch Datetime and get difference in seconds similar to applySnap
        elif("mon" in unit):
            try:
                tz = tzlocal.get_localzone()
                epoch = datetime.datetime.fromtimestamp(0,tz=tz)
                x = ((datetime.datetime.fromtimestamp(int(original),tz=tz)+relativedelta(months=int(offset)))-epoch).total_seconds()
            except NameError:
                raise Exception("No Month Functionality; install python-dateutil library")
            except Exception as e:
                raise e
        else:
            raise Exception("Error applying time offset; no match for unit")
        return x

    # Todo - better name for function; test more edge cases for possible earliest and latest patterns
    # getTimeRange will get earliest or latest based on the scheduled run time
    # relTime is the pattern for earliest or latest stored in Splunk
    # This could be as simple as -15m or @d or be complex as -1mon@y+12d
    def getTimeRange(self,relTime,runTime):
        #Regex to extract each offset and snap
        m = re.match("((?P<offset1>[+-]?\d+)(?P<unit1>[a-zA-Z]+)(?:(?P<offset2>[+-]\d+)(?P<unit2>[a-zA-Z]+))?)?(?:@(?P<snap>[a-zA-Z]+)(?:(?P<snapOff>[+-]\d+)(?P<snapUnit>[a-zA-Z]+))?)?",relTime)
        if relTime.isdigit():
            #If it is static time
            return relTime
        elif m and relTime!="now":
            # Apply snap then offsets in the following order: snap offset, first offset, second offset
            # The only time I think the order of offset would matter is when "mon" is used.
            self.logger.debug("[RERUN CMD]: Original: {0} {1}".format(runTime,relTime))
            runTime = self.applySnap(m.group('snap'),runTime)
            runTime = self.applyOffset(m.group('snapOff'),m.group('snapUnit'),runTime)
            runTime = self.applyOffset(m.group('offset1'),m.group('unit1'),runTime)
            runTime = self.applyOffset(m.group('offset2'),m.group('unit2'),runTime)
            self.logger.debug("[RERUN CMD]: Result: {}".format(runTime))
        return runTime
 
    def generate(self):
        #Todo - allow host to be set as a parameter
        host = "localhost"
        #Get port info from uri in case using non-standard mgmt port
        splunkd_uri = self._metadata.searchinfo.splunkd_uri
        port = splunkd_uri.split(":")[-1]
        
        #Owner will be set as whoever ran the search
        owner = self._metadata.searchinfo.owner
        app= self._metadata.searchinfo.app
        
        #Get token to authenticate to API to rerun searches
        token=self._metadata.searchinfo.session_key
        
        #Use the rerun command's earliest and latest as the outage period; this way it can be set by the time picker instead of as parameters
        outageStart = self._metadata.searchinfo.earliest_time
        outageEnd = self._metadata.searchinfo.latest_time
        
        # Get the rerun command search id - this is because Splunk was not killing the python script when search was cancelled
        # Use this to monitor the status of the search and if it is no longer "Running" exit the script
        rerunSid = self._metadata.searchinfo.sid
        
        #Compile regex to find searches
        filter = re.compile(self.regex)
        
        #Try to connect to Splunk API
        self.logger.info("[RERUN CMD]: Connecting to Splunk API...")
        try:
            #service = client.connect(host=host, port=port, token=token, owner=owner, app=app)
            service = client.connect(host=host, port=port, token=token)
            self.logger.info("[RERUN CMD]: Connected to Splunk API successfully")
        except Exception as e:
            self.logger.error("[RERUN CMD]: {}".format(e.msg))
            
        # Splunk does not stop the script on its own, so poll the SID from here and exit if the user cancels the search
        # Todo - look into getting specific job info based on the sid instead of iterating over all jobs
        rerunJob = None
        for job in service.jobs:
            if job.sid == rerunSid:
                rerunJob = job
                self.logger.debug(job.state)        
        # If for some reason the script can't find the search that triggered it
        if not rerunJob:
            self.logger.error("[RERUN CMD]: Rerun job SID not found, exiting...")
            sys.exit(1)
        
        # Main loop to find and rerun searches
        for search in service.saved_searches:
            # Do not rerun disabled searches
            if filter.search(search.name) and search.is_scheduled=="1" and search.disabled=="0":
                #Parse the Splunk cron schedule for the found search
                ct = CronTab(search['content']['cron_schedule'])
                
                #Get earliest and latest pattern for search
                dispatch_earliest = search['content']['dispatch.earliest_time']
                dispatch_latest = search['content']['dispatch.latest_time']
                
                # Start with runTime equal to outageStart, crontab will be used to set this to the next time scheduled search
                # would have run before rerunning
                runTime=outageStart
                while True:
                    # Check to see if the search has been cancelled by user
                    rerunJob.refresh()
                    if rerunJob.state.content.dispatchState!="RUNNING":
                        sys.exit()
                    
                    # Get next scheduled run time, and break if greater than outageEnd
                    runTime = runTime + ct.next(now=runTime,default_utc=False)
                    if runTime > outageEnd or rerunJob.state.content.dispatchState!="RUNNING":
                        self.logger.error(rerunJob.state.content.dispatchState)
                        break
                    
                    #Get new earliest and latest based on new search run time
                    earliest = self.getTimeRange(dispatch_earliest,runTime)
                    latest = self.getTimeRange(dispatch_latest,runTime)
                    
                    # Set search parameters and run search
                    kwargs_block = {'dispatch.earliest_time':earliest, "dispatch.latest_time":latest, "trigger_actions":self.trigger}
                    job = search.dispatch(**kwargs_block)
                    time.sleep(0.25)
                    #Couldn't pass blocking argument, so sleep until isDone
                    while job['isDone']!="1":
                        self.logger.debug("[RERUN CMD]: Percent {}".format(job['doneProgress']))
                        time.sleep(1)
                        job.refresh()
                    message = "{} ran sucessfully for scheduled time {}".format(search.name,runTime)
                    self.logger.info("[RERUN CMD]: {}".format(message))
                    #Return results
                    yield {"_time":time.time(), "Message":message,"Search":search.name, "MissedRunTime":runTime, "MissedEarliest":earliest,"MissedLatest":latest, "TriggerActions":self.trigger,"Finished":job['isDone'],"CompletionPercentage":float(job['doneProgress'])*100,"ScanCount":job['scanCount'],"EventCount":job['eventCount'],"ResultCount": job['resultCount']}