def __init__(self): """Initialize the data fetcher.""" super().__init__() self._datastore = OpenSearchDataStore( host=current_app.config["OPENSEARCH_HOST"], port=current_app.config["OPENSEARCH_PORT"], )
def __init__(self): """Initialize the data finder.""" self._end_date = "" self._indices = [] self._parameters = {} self._rule = {} self._start_date = "" self._timeline_ids = [] self._datastore = OpenSearchDataStore( host=current_app.config["OPENSEARCH_HOST"], port=current_app.config["OPENSEARCH_PORT"], )
def __init__(self, sketch_id=None, indices=None, timeline_ids=None):
    """Initialize the aggregator object.

    Args:
        sketch_id: Sketch ID.
        indices: Optional list of OpenSearch index names. If not provided
            the default behavior is to include all the indices in a sketch.
        timeline_ids: Optional list of timeline IDs, if not provided the
            default behavior is to query all the data in the provided
            search indices.
    """
    if not sketch_id and not indices:
        raise RuntimeError("Need at least sketch_id or index")

    self.opensearch = OpenSearchDataStore(
        host=current_app.config.get("OPENSEARCH_HOST"),
        port=current_app.config.get("OPENSEARCH_PORT"),
    )

    self._sketch_url = "/sketch/{0:d}/explore".format(sketch_id)
    self.field = ""
    self.indices = indices
    self.sketch = SQLSketch.query.get(sketch_id)
    self.timeline_ids = None

    active_timelines = self.sketch.active_timelines
    if not self.indices:
        self.indices = [t.searchindex.index_name for t in active_timelines]

    if timeline_ids:
        valid_ids = [t.id for t in active_timelines]
        self.timeline_ids = [t for t in timeline_ids if t in valid_ids]
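# Example (illustrative): timeline IDs that are not attached to the sketch are
# silently dropped by the filtering above.
#
#   valid_ids = [1, 2, 5]                         # from sketch.active_timelines
#   timeline_ids = [2, 7]                         # caller-supplied
#   [t for t in timeline_ids if t in valid_ids]   # -> [2]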
@property
def datastore(self):
    """Property to get an instance of the datastore backend.

    Returns:
        Instance of lib.datastores.opensearch.OpenSearchDataStore
    """
    return OpenSearchDataStore(
        host=current_app.config["OPENSEARCH_HOST"],
        port=current_app.config["OPENSEARCH_PORT"],
    )
def run_csv_jsonl(
    file_path, events, timeline_name, index_name, source_type, timeline_id
):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the JSON or CSV file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        timeline_id: ID of the timeline object this data belongs to.

    Returns:
        Name (str) of the index.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = "jsonl"
    else:
        file_handle = codecs.open(file_path, "r", encoding="utf-8", errors="replace")

    event_type = "generic_event"  # Document type for OpenSearch

    validators = {
        "csv": read_and_validate_csv,
        "jsonl": read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logger.info(
        "Index timeline [{0:s}] to index [{1:s}] (source: {2:s})".format(
            timeline_name, index_name, source_type
        )
    )

    mappings = None
    mappings_file_path = current_app.config.get("GENERIC_MAPPING_FILE", "")
    if os.path.isfile(mappings_file_path):
        try:
            with open(mappings_file_path, "r") as mfh:
                mappings = json.load(mfh)

                if not isinstance(mappings, dict):
                    raise RuntimeError(
                        "Unable to create mappings, the mappings are not a "
                        "dict, please look at the file: {0:s}".format(
                            mappings_file_path
                        )
                    )
        except (json.JSONDecodeError, IOError):
            logger.error("Unable to read in mapping", exc_info=True)

    opensearch = OpenSearchDataStore(
        host=current_app.config["OPENSEARCH_HOST"],
        port=current_app.config["OPENSEARCH_PORT"],
    )

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    final_counter = 0
    error_msg = ""
    error_count = 0
    try:
        opensearch.create_index(
            index_name=index_name, doc_type=event_type, mappings=mappings
        )
        for event in read_and_validate(file_handle):
            opensearch.import_event(
                index_name, event_type, event, timeline_id=timeline_id
            )
            final_counter += 1

        # Import the remaining events
        results = opensearch.flush_queued_events()

        error_container = results.get("error_container", {})
        # Number of events that failed to import (0 if not reported).
        error_count = results.get("error_count", 0)
        error_msg = get_import_errors(
            error_container=error_container,
            index_name=index_name,
            total_count=results.get("total_events", 0),
        )

    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status="fail", error_msg=str(e))
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError, RequestError) as e:
        _set_timeline_status(timeline_id, status="fail", error_msg=str(e))
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status="fail", error_msg=error_msg)
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        logger.error("Error: {0!s}\n{1:s}".format(e, error_msg))
        return None

    if error_count:
        logger.info(
            "Index timeline: [{0:s}] to index [{1:s}] - {2:d} out of {3:d} "
            "events imported (in total {4:d} errors were discovered) ".format(
                timeline_name,
                index_name,
                (final_counter - error_count),
                final_counter,
                error_count,
            )
        )
    else:
        logger.info(
            "Index timeline: [{0:s}] to index [{1:s}] - {2:d} "
            "events imported.".format(timeline_name, index_name, final_counter)
        )

    # Set status to ready when done
    _set_timeline_status(timeline_id, status="ready", error_msg=error_msg)

    return index_name
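# Example (illustrative sketch, not part of the task code): a minimal JSON
# document for the file referenced by GENERIC_MAPPING_FILE. The field names
# below are hypothetical; run_csv_jsonl only requires that json.load()
# returns a dict, shown here in the common OpenSearch "properties" layout.
EXAMPLE_GENERIC_MAPPINGS = {
    "properties": {
        "message": {"type": "text"},
        "datetime": {"type": "date"},
        "timestamp_desc": {"type": "keyword"},
    }
}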
def run_plaso(file_path, events, timeline_name, index_name, source_type, timeline_id):
    """Create a Celery task for processing a Plaso storage file.

    Args:
        file_path: Path to the plaso file on disk.
        events: String with event data, invalid for plaso files.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        timeline_id: ID of the timeline object this data belongs to.

    Raises:
        RuntimeError: If the function is called using events, plaso is not
            installed or is of unsupported version.

    Returns:
        Name (str) of the index.
    """
    if not plaso:
        raise RuntimeError(
            "Plaso isn't installed, unable to continue processing plaso files."
        )

    plaso_version = int(plaso.__version__)
    if plaso_version <= PLASO_MINIMUM_VERSION:
        raise RuntimeError(
            "Plaso version is out of date (version {0:d}), please upgrade to "
            "a version that is later than {1:d}".format(
                plaso_version, PLASO_MINIMUM_VERSION
            )
        )

    if events:
        raise RuntimeError("Plaso uploads need a file, not events.")

    event_type = "generic_event"  # Document type for OpenSearch

    mappings = None
    mappings_file_path = current_app.config.get("PLASO_MAPPING_FILE", "")
    if os.path.isfile(mappings_file_path):
        try:
            with open(mappings_file_path, "r") as mfh:
                mappings = json.load(mfh)

                if not isinstance(mappings, dict):
                    raise RuntimeError(
                        "Unable to create mappings, the mappings are not a "
                        "dict, please look at the file: {0:s}".format(
                            mappings_file_path
                        )
                    )
        except (json.JSONDecodeError, IOError):
            logger.error("Unable to read in mapping", exc_info=True)

    opensearch_server = current_app.config.get("OPENSEARCH_HOST")
    if not opensearch_server:
        raise RuntimeError(
            "Unable to connect to OpenSearch, no server set, unable to "
            "process plaso file."
        )
    opensearch_port = current_app.config.get("OPENSEARCH_PORT")
    if not opensearch_port:
        raise RuntimeError(
            "Unable to connect to OpenSearch, no port set, unable to "
            "process plaso file."
        )

    opensearch = OpenSearchDataStore(host=opensearch_server, port=opensearch_port)

    try:
        opensearch.create_index(
            index_name=index_name, doc_type=event_type, mappings=mappings
        )
    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status="fail", error_msg=str(e))
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError, RequestError) as e:
        _set_timeline_status(timeline_id, status="fail", error_msg=str(e))
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status="fail", error_msg=error_msg)
        logger.error("Error: {0!s}\n{1:s}".format(e, error_msg))
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        return None

    message = "Index timeline [{0:s}] to index [{1:s}] (source: {2:s})"
    logger.info(message.format(timeline_name, index_name, source_type))

    try:
        psort_path = current_app.config["PSORT_PATH"]
    except KeyError:
        psort_path = "psort.py"

    cmd = [
        psort_path,
        "-o",
        "elastic_ts",
        file_path,
        "--server",
        opensearch_server,
        "--port",
        str(opensearch_port),
        "--status_view",
        "none",
        "--index_name",
        index_name,
    ]
    if mappings_file_path:
        cmd.extend(["--elastic_mappings", mappings_file_path])

    if timeline_id:
        cmd.extend(["--timeline_identifier", str(timeline_id)])

    opensearch_username = current_app.config.get("OPENSEARCH_USER", "")
    if opensearch_username:
        cmd.extend(["--elastic_user", opensearch_username])

    opensearch_password = current_app.config.get("OPENSEARCH_PASSWORD", "")
    if opensearch_password:
        cmd.extend(["--elastic_password", opensearch_password])

    opensearch_ssl = current_app.config.get("OPENSEARCH_SSL", False)
    if opensearch_ssl:
        cmd.extend(["--use_ssl"])

    psort_memory = current_app.config.get("PLASO_UPPER_MEMORY_LIMIT", None)
    if psort_memory is not None:
        cmd.extend(["--process_memory_limit", str(psort_memory)])

    # Run psort.py
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT, encoding="utf-8")
    except subprocess.CalledProcessError as e:
        # Mark the searchindex and timelines as failed and exit the task
        _set_timeline_status(timeline_id, status="fail", error_msg=e.output)
        _close_index(
            index_name=index_name, data_store=opensearch, timeline_id=timeline_id
        )
        return e.output

    # Mark the searchindex and timelines as ready
    _set_timeline_status(timeline_id, status="ready")

    return index_name
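# Example (illustrative): with a hypothetical configuration of
# OPENSEARCH_HOST="127.0.0.1", OPENSEARCH_PORT=9200 and a timeline ID of 42,
# the command list built above corresponds roughly to:
#
#   psort.py -o elastic_ts /path/to/storage.plaso \
#       --server 127.0.0.1 --port 9200 \
#       --status_view none --index_name <index_name> \
#       --timeline_identifier 42
#
# The optional flags (--elastic_mappings, --elastic_user, --elastic_password,
# --use_ssl, --process_memory_limit) are only appended when the matching
# configuration values are set.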
class ApiDataFetcher(interface.DataFetcher):
    """Data Fetcher for an API story exporter."""

    def __init__(self):
        """Initialize the data fetcher."""
        super().__init__()
        self._datastore = OpenSearchDataStore(
            host=current_app.config["OPENSEARCH_HOST"],
            port=current_app.config["OPENSEARCH_PORT"],
        )

    def get_aggregation(self, agg_dict):
        """Returns an aggregation object from an aggregation dict.

        Args:
            agg_dict (dict): a dictionary containing information about
                the stored aggregation.

        Returns:
            A dict with metadata information as well as the aggregation
            object (instance of AggregationResult) from a saved aggregation
            or an empty dict if not found.
        """
        aggregation_id = agg_dict.get("id")
        if not aggregation_id:
            return {}

        aggregation = Aggregation.query.get(aggregation_id)
        if not aggregation:
            return {}

        try:
            agg_class = aggregator_manager.AggregatorManager.get_aggregator(
                aggregation.agg_type
            )
        except KeyError:
            return {}

        if not agg_class:
            return {}

        parameter_string = aggregation.parameters
        parameters = json.loads(parameter_string)
        parameter_index = parameters.pop("index", None)
        indices, timeline_ids = self.get_indices_and_timelines(parameter_index)
        aggregator = agg_class(
            sketch_id=self._sketch_id, indices=indices, timeline_ids=timeline_ids
        )

        _ = parameters.pop("supported_charts", None)
        chart_color = parameters.pop("chart_color", "N/A")
        chart_title = parameters.pop("chart_title", "N/A")

        data = {
            "aggregation": aggregator.run(**parameters),
            "name": aggregation.name,
            "description": aggregation.description,
            "agg_type": aggregation.agg_type,
            "parameters": parameters,
            "chart_type": aggregation.chart_type,
            "chart_title": chart_title,
            "chart_color": chart_color,
            "user": aggregation.user,
        }
        return data

    def get_aggregation_group(self, agg_dict):
        """Returns an aggregation group object from an aggregation dict.

        Args:
            agg_dict (dict): a dictionary containing information about
                the stored aggregation group.

        Returns:
            A dict that contains metadata about the aggregation group
            as well as a chart object (instance of altair.Chart) with the
            combined chart object from the group.
        """
        group_id = agg_dict.get("id")
        if not group_id:
            return None

        group = AggregationGroup.query.get(group_id)
        if not group:
            return None

        orientation = group.orientation

        result_chart = None
        for aggregator in group.aggregations:
            if aggregator.parameters:
                aggregator_parameters = json.loads(aggregator.parameters)
            else:
                aggregator_parameters = {}

            agg_class = aggregator_manager.AggregatorManager.get_aggregator(
                aggregator.agg_type
            )
            if not agg_class:
                continue

            parameter_index = aggregator_parameters.pop("index", None)
            indices, timeline_ids = self.get_indices_and_timelines(parameter_index)
            aggregator_obj = agg_class(
                sketch_id=self._sketch_id, indices=indices, timeline_ids=timeline_ids
            )

            chart_type = aggregator_parameters.pop("supported_charts", None)
            color = aggregator_parameters.pop("chart_color", "")
            chart_title = aggregator_parameters.pop("chart_title", None)

            result_obj = aggregator_obj.run(**aggregator_parameters)

            title = chart_title or aggregator_obj.chart_title
            chart = result_obj.to_chart(
                chart_name=chart_type,
                chart_title=title,
                as_chart=True,
                interactive=True,
                color=color,
            )

            if result_chart is None:
                result_chart = chart
            elif orientation == "horizontal":
                result_chart = alt.hconcat(chart, result_chart)
            elif orientation == "vertical":
                result_chart = alt.vconcat(chart, result_chart)
            else:
                result_chart = alt.layer(chart, result_chart)

        data = {
            "name": group.name,
            "description": group.description,
            "chart": result_chart,
            "parameters": group.parameters,
            "orientation": group.orientation,
            "user": group.user,
        }
        return data

    def get_indices_and_timelines(self, index_list):
        """Returns a tuple with a list of indices and a list of timeline IDs.

        Args:
            index_list (list): A list of timeline IDs (int) and
                indices (str).

        Returns:
            A tuple with two items, a list of indices and a list of
            timeline IDs.
        """
        indices = []
        timeline_ids = []

        if isinstance(index_list, str):
            index_list = index_list.split(",")

        for index in index_list:
            if isinstance(index, str):
                indices.append(index)
            if isinstance(index, int):
                timeline_ids.append(index)

        return indices, timeline_ids

    def get_view(self, view_dict):
        """Returns a data frame from a view dict.

        Args:
            view_dict (dict): a dictionary containing information about
                the stored view.

        Returns:
            A pandas DataFrame with the results from a view aggregation.
        """
        view_id = view_dict.get("id")
        if not view_id:
            return pd.DataFrame()

        view = View.query.get(view_id)
        if not view:
            return pd.DataFrame()

        if not view.query_string and not view.query_dsl:
            return pd.DataFrame()

        query_filter = view.query_filter
        if query_filter and isinstance(query_filter, str):
            query_filter = json.loads(query_filter)
        elif not query_filter:
            query_filter = {"indices": "_all", "size": 100}

        if view.query_dsl:
            query_dsl = json.loads(view.query_dsl)
        else:
            query_dsl = None

        sketch = Sketch.query.get_with_acl(self._sketch_id)
        sketch_indices = [t.searchindex.index_name for t in sketch.active_timelines]

        results = self._datastore.search_stream(
            sketch_id=self._sketch_id,
            query_string=view.query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=sketch_indices,
        )
        result_list = [x.get("_source") for x in results]
        return pd.DataFrame(result_list)
class BaseGraphPlugin:
    """Base class for a graph.

    Attributes:
        datastore (OpenSearchDataStore): OpenSearch datastore object.
        graph (nx.Graph): NetworkX Graph object.
    """

    # Name that the graph will be registered as.
    NAME = "name"

    # Display name (used in the UI)
    DISPLAY_NAME = "display_name"

    # Description of the plugin (used in the UI)
    DESCRIPTION = "description"

    # Type of graph. There are four supported types: Undirected Graph,
    # Undirected Multi Graph, Directed Graph, Directed Multi Graph.
    # If you have multiple edges between nodes you need to use the multi graphs.
    #
    # See NetworkX documentation for details:
    # https://networkx.org/documentation/stable/reference/classes/index.html
    GRAPH_TYPE = "MultiDiGraph"

    def __init__(self, sketch=None, timeline_ids=None):
        """Initialize the graph object.

        Args:
            sketch (Sketch): Sketch object.
            timeline_ids (List[int]): An optional list of timeline IDs.

        Raises:
            KeyError: if the graph type specified is not supported.
        """
        self.datastore = OpenSearchDataStore(
            host=current_app.config["OPENSEARCH_HOST"],
            port=current_app.config["OPENSEARCH_PORT"],
        )
        if not GRAPH_TYPES.get(self.GRAPH_TYPE):
            raise KeyError(f"Graph type {self.GRAPH_TYPE} is not supported")
        self.graph = Graph(self.GRAPH_TYPE)
        self.sketch = sketch
        self.timeline_ids = timeline_ids

    def _get_sketch_indices(self):
        """List all indices in the Sketch, or those that belong to a timeline.

        Returns:
            List of index names.
        """
        active_timelines = self.sketch.active_timelines

        if self.timeline_ids:
            indices = [
                t.searchindex.index_name
                for t in active_timelines
                if t.id in self.timeline_ids
            ]
        else:
            indices = [t.searchindex.index_name for t in active_timelines]

        return indices

    # TODO: Refactor this to reuse across analyzers and graphs.
    def event_stream(
        self,
        query_string=None,
        query_filter=None,
        query_dsl=None,
        indices=None,
        return_fields=None,
        scroll=True,
    ):
        """Search OpenSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing OpenSearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.
            scroll: Boolean determining whether we support scrolling searches
                or not. Defaults to True.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError("Both query_string and query_dsl are missing")

        # Query all sketch indices if none are specified.
        if not indices:
            indices = self._get_sketch_indices()

        if not query_filter:
            query_filter = {}

        # Guard against a missing field list before deduplicating.
        return_fields = list(set(return_fields or []))

        if self.timeline_ids:
            timeline_ids = self.timeline_ids
        else:
            timeline_ids = None

        event_generator = self.datastore.search_stream(
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            return_fields=return_fields,
            timeline_ids=timeline_ids,
            enable_scroll=scroll,
        )
        return event_generator

    def generate(self):
        """Entry point for the graph."""
        raise NotImplementedError
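# Example (illustrative sketch, not a plugin shipped with this module): a
# minimal graph plugin sets the metadata constants and implements generate(),
# typically by streaming events and adding nodes and edges to self.graph. The
# data_type, the field names and the Graph helper methods add_node()/add_edge()
# used below are assumptions made for the example; adapt them to the actual
# Graph wrapper API and to your data.
class ExampleLoginGraph(BaseGraphPlugin):
    """Hypothetical graph connecting user names to the hosts they logged into."""

    NAME = "example_login_graph"
    DISPLAY_NAME = "Example login graph"
    DESCRIPTION = "Illustrative example plugin."

    def generate(self):
        """Generate the graph from hypothetical login events."""
        events = self.event_stream(
            query_string='data_type:"example:login:event"',
            return_fields=["username", "hostname"],
        )
        for event in events:
            source = event.get("_source", {})
            username = source.get("username")
            hostname = source.get("hostname")
            if not (username and hostname):
                continue
            # add_node()/add_edge() are assumed helpers on the Graph wrapper.
            user_node = self.graph.add_node(username, {"type": "user"})
            host_node = self.graph.add_node(hostname, {"type": "host"})
            self.graph.add_edge(user_node, host_node, "LOGGED_IN", event)
        return self.graph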
class DataFinder:
    """The data finder class."""

    def __init__(self):
        """Initialize the data finder."""
        self._end_date = ""
        self._indices = []
        self._parameters = {}
        self._rule = {}
        self._start_date = ""
        self._timeline_ids = []

        self._datastore = OpenSearchDataStore(
            host=current_app.config["OPENSEARCH_HOST"],
            port=current_app.config["OPENSEARCH_PORT"],
        )

    def can_run(self):
        """Returns a boolean whether the data finder can be run."""
        if not self._rule:
            logger.warning("Unable to run data finder since no rule has been defined.")
            return False

        if not self._start_date:
            logger.warning(
                "Unable to run data finder since no start date has been defined."
            )
            return False

        if not self._end_date:
            logger.warning(
                "Unable to run data finder since no end date has been defined."
            )
            return False

        if not self._parameters:
            return True

        re_parameters = self._rule.get("re_parameters", [])
        for parameter in re_parameters:
            if parameter not in self._parameters:
                logger.warning(
                    "Parameters are defined, but parameter: [{0:s}] does not "
                    "exist in parameter definitions for the rule.".format(parameter)
                )
                return False

        return True

    def set_end_date(self, end_date):
        """Sets the end date of the time period the data finder uses."""
        # TODO: Implement a check if this is a valid ISO formatted date.
        self._end_date = end_date

    def set_indices(self, indices):
        """Sets the value of the indices."""
        self._indices = indices

    def set_parameter(self, parameter, value):
        """Sets the value of a single parameter.

        Args:
            parameter (str): The string value of the parameter name.
            value (any): The value of the parameter.
        """
        self._parameters[parameter] = value

    def set_parameters(self, parameter_dict):
        """Set multiple parameters at once using a dict.

        Args:
            parameter_dict (dict): A set of parameters and their values.
        """
        if isinstance(parameter_dict, dict):
            self._parameters.update(parameter_dict)

    def set_rule(self, rule_dict):
        """Sets the rules of the data finder.

        Args:
            rule_dict (dict): A dict with the parameters for the data finder
                to operate, this includes search parameters, regular
                expression, etc.
        """
        self._rule = rule_dict

    def set_start_date(self, start_date):
        """Sets the start date of the time period the data finder uses."""
        # TODO: Implement a check if this is a valid ISO formatted date.
        self._start_date = start_date

    def set_timeline_ids(self, timeline_ids):
        """Sets the timeline identifiers."""
        self._timeline_ids = timeline_ids

    def find_data(self):
        """Returns a tuple with a bool on whether data was found and a message.

        Raises:
            RuntimeError: If the data finder cannot run.

        Returns:
            A tuple with two entries:
                bool: whether data was discovered or not.
                str: a message string indicating how the data was found or
                    the reason why it wasn't.
        """
        if not self.can_run():
            return False, "Unable to run the data finder, missing information."

        query_string = self._rule.get("query_string")
        query_dsl = self._rule.get("query_dsl")

        if not query_string and not query_dsl:
            raise RuntimeError(
                "Unable to run, missing either a query string or a DSL to "
                "perform the search."
            )

        attribute = self._rule.get("attribute")
        regular_expression = self._rule.get("regular_expression")
        if regular_expression:
            if not attribute:
                raise RuntimeError(
                    "Attribute must be set in a rule if a regular expression "
                    "is used."
                )
            expression = utils.compile_regular_expression(
                expression_string=regular_expression,
                expression_flags=self._rule.get("re_flags"),
                expression_parameters=self._rule.get("re_parameters"),
            )
        else:
            expression = None

        query_filter = {
            "chips": [
                {
                    "field": "",
                    "type": "datetime_range",
                    "operator": "must",
                    "active": True,
                    "value": f"{self._start_date},{self._end_date}",
                }
            ]
        }

        event_generator = self._datastore.search_stream(
            query_string=query_string,
            query_dsl=query_dsl,
            query_filter=query_filter,
            indices=self._indices,
            return_fields=attribute,
            enable_scroll=True,
            timeline_ids=self._timeline_ids,
        )

        for event in event_generator:
            # TODO: Save the result to the Investigation object when that
            # exists in the future.
            if not expression:
                return True, "Data discovered"

            source = event.get("_source", {})
            value = source.get(attribute)
            if not value:
                logger.warning("Attribute: [{0:s}] is empty".format(attribute))
                continue

            result = expression.findall(value)
            if not result:
                continue

            return True, "Data discovered using Regular Expression"

        return False, "No hits discovered"
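# Example (illustrative): the calls needed before find_data() will run. The
# rule keys mirror the ones read above (query_string/query_dsl, attribute,
# regular_expression, re_flags, re_parameters); the query, regular expression,
# index name and timeline ID are made up for the sketch, and constructing
# DataFinder() requires an active Flask application context for the
# OpenSearch configuration.
def example_data_finder_run():
    """Run a hypothetical data finder rule and return (found, message)."""
    finder = DataFinder()
    finder.set_rule(
        {
            "query_string": 'data_type:"example:login:event"',
            "attribute": "message",
            "regular_expression": r"failed login for \w+",
        }
    )
    finder.set_start_date("2024-01-01T00:00:00")
    finder.set_end_date("2024-01-02T00:00:00")
    finder.set_indices(["example_index"])
    finder.set_timeline_ids([1])
    return finder.find_data()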
class BaseAnalyzer:
    """Base class for analyzers.

    Attributes:
        name: Analyzer name.
        index_name: Name of OpenSearch index.
        datastore: OpenSearch datastore client.
        sketch: Instance of Sketch object.
        timeline_id: The ID of the timeline the analyzer runs on.
        tagged_events: Dict with all events to add tags and those tags.
        emoji_events: Dict with all events to add emojis and those emojis.
    """

    NAME = "name"
    DISPLAY_NAME = None
    DESCRIPTION = None

    # If this analyzer depends on another analyzer
    # it needs to be included in this frozenset by using
    # the indexer names.
    DEPENDENCIES = frozenset()

    # Used as hints to the frontend UI in order to render input forms.
    FORM_FIELDS = []

    # Configure how long an analyzer should run before the timeline
    # gets fully indexed.
    SECONDS_PER_WAIT = 10
    MAXIMUM_WAITS = 360

    def __init__(self, index_name, sketch_id, timeline_id=None):
        """Initialize the analyzer object.

        Args:
            index_name: OpenSearch index name.
            sketch_id: Sketch ID.
            timeline_id: The timeline ID.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.sketch = Sketch(sketch_id=sketch_id)
        self.timeline_id = timeline_id
        self.timeline_name = ""

        self.tagged_events = {}
        self.emoji_events = {}

        self.datastore = OpenSearchDataStore(
            host=current_app.config["OPENSEARCH_HOST"],
            port=current_app.config["OPENSEARCH_PORT"],
        )

        if not hasattr(self, "sketch"):
            self.sketch = None

    def event_pandas(
        self,
        query_string=None,
        query_filter=None,
        query_dsl=None,
        indices=None,
        return_fields=None,
    ):
        """Search OpenSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing OpenSearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to be included in the search
                results, if not included all fields will be included in the
                results.

        Returns:
            A pandas DataFrame with all the events.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError("Both query_string and query_dsl are missing")

        if not query_filter:
            query_filter = {"indices": self.index_name, "size": 10000}

        if not indices:
            indices = [self.index_name]

        if self.timeline_id:
            timeline_ids = [self.timeline_id]
        else:
            timeline_ids = None

        # Refresh the index to make sure it is searchable.
        # Iterate over a copy since broken indices are removed from the list.
        for index in list(indices):
            try:
                self.datastore.client.indices.refresh(index=index)
            except opensearchpy.NotFoundError:
                logger.error(
                    "Unable to refresh index: {0:s}, not found, "
                    "removing from list.".format(index)
                )
                broken_index = indices.index(index)
                _ = indices.pop(broken_index)

        if not indices:
            raise ValueError("Unable to get events, no indices to query.")

        if return_fields:
            default_fields = definitions.DEFAULT_SOURCE_FIELDS
            return_fields.extend(default_fields)
            return_fields = list(set(return_fields))
            return_fields = ",".join(return_fields)

        results = self.datastore.search_stream(
            sketch_id=self.sketch.id,
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            timeline_ids=timeline_ids,
            return_fields=return_fields,
        )

        events = []
        for event in results:
            source = event.get("_source")
            source["_id"] = event.get("_id")
            source["_type"] = event.get("_type")
            source["_index"] = event.get("_index")
            events.append(source)

        return pandas.DataFrame(events)

    def event_stream(
        self,
        query_string=None,
        query_filter=None,
        query_dsl=None,
        indices=None,
        return_fields=None,
        scroll=True,
    ):
        """Search OpenSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing OpenSearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.
            scroll: Boolean determining whether we support scrolling searches
                or not. Defaults to True.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError("Both query_string and query_dsl are missing")

        if not query_filter:
            query_filter = {"indices": self.index_name}

        # If not provided we default to the message field as this will always
        # be present.
        if not return_fields:
            return_fields = ["message"]

        # Make sure we always return tag, human_readable and emoji attributes.
        return_fields.extend(["tag", "human_readable", "__ts_emojis"])
        return_fields = list(set(return_fields))

        if not indices:
            indices = [self.index_name]

        # Refresh the index to make sure it is searchable.
        # Iterate over a copy since broken indices are removed from the list.
        for index in list(indices):
            try:
                self.datastore.client.indices.refresh(index=index)
            except opensearchpy.NotFoundError:
                logger.error(
                    "Unable to find index: {0:s}, removing from "
                    "result set.".format(index)
                )
                broken_index = indices.index(index)
                _ = indices.pop(broken_index)

        if not indices:
            raise ValueError(
                "Unable to query for analyzers, discovered no index to query."
            )

        if self.timeline_id:
            timeline_ids = [self.timeline_id]
        else:
            timeline_ids = None

        # Exponential backoff for the call to OpenSearch. Sometimes the
        # cluster can be a bit overloaded and timeout on requests. We want to
        # retry a few times in order to give the cluster a chance to return
        # results.
        backoff_in_seconds = 3
        retries = 5
        for x in range(0, retries):
            try:
                event_generator = self.datastore.search_stream(
                    query_string=query_string,
                    query_filter=query_filter,
                    query_dsl=query_dsl,
                    indices=indices,
                    return_fields=return_fields,
                    enable_scroll=scroll,
                    timeline_ids=timeline_ids,
                )
                for event in event_generator:
                    yield Event(
                        event, self.datastore, sketch=self.sketch, analyzer=self
                    )
                break  # Query was successful.
            except opensearchpy.TransportError as e:
                sleep_seconds = backoff_in_seconds * 2**x + random.uniform(3, 7)
                logger.info(
                    "Attempt: {0:d}/{1:d} sleeping {2:f} for query {3:s}".format(
                        x + 1, retries, sleep_seconds, query_string
                    )
                )
                time.sleep(sleep_seconds)

                if x == retries - 1:
                    logger.error(
                        "Timeout executing search for {0:s}: {1!s}".format(
                            query_string, e
                        ),
                        exc_info=True,
                    )
                    raise

    @_flush_datastore_decorator
    def run_wrapper(self, analysis_id):
        """A wrapper method to run the analyzer.

        This method is decorated to flush the bulk insert operation on the
        datastore. This makes sure that all events are indexed at exit.

        Returns:
            Return value of the run method.
        """
        analysis = Analysis.query.get(analysis_id)
        analysis.set_status("STARTED")

        timeline = analysis.timeline
        self.timeline_name = timeline.name
        searchindex = timeline.searchindex

        counter = 0
        while True:
            status = searchindex.get_status.status
            status = status.lower()
            if status == "ready":
                break

            if status == "fail":
                logger.error(
                    "Unable to run analyzer on a failed index ({0:s})".format(
                        searchindex.index_name
                    )
                )
                return "Failed"

            time.sleep(self.SECONDS_PER_WAIT)
            counter += 1
            if counter >= self.MAXIMUM_WAITS:
                logger.error(
                    "Indexing has taken too long, aborting run of analyzer"
                )
                return "Failed"

            # Refresh the searchindex object.
            db_session.refresh(searchindex)

        # Run the analyzer. Broad Exception catch to catch any error and store
        # the error in the DB for display in the UI.
        try:
            result = self.run()
            analysis.set_status("DONE")
        except Exception:  # pylint: disable=broad-except
            analysis.set_status("ERROR")
            result = traceback.format_exc()

        # Update database analysis object with result and status
        analysis.result = "{0:s}".format(result)
        db_session.add(analysis)
        db_session.commit()

        return result

    @classmethod
    def get_kwargs(cls):
        """Get keyword arguments needed to instantiate the class.

        Every analyzer gets the index_name as its first argument from Celery.
        By default this is the only argument. If your analyzer needs more
        arguments you can override this method and return them as a
        dictionary.

        If you want more than one instance to be created for your analyzer
        you can return a list of dictionaries with kwargs and each one will
        be instantiated and registered in Celery. This is neat if you want to
        run your analyzer with different arguments in parallel.

        Returns:
            List of keyword argument dicts or an empty list if no extra
            arguments are needed.
        """
        return []

    def run(self):
        """Entry point for the analyzer."""
        raise NotImplementedError
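# Example (illustrative sketch, not a shipped analyzer): a minimal subclass
# only needs to set NAME and implement run(). The query, the tag name and the
# Event helper methods add_tags()/commit() used below are assumptions made
# for the example; adapt them to the actual Event API used by your analyzers.
class ExampleKeywordAnalyzer(BaseAnalyzer):
    """Hypothetical analyzer that tags events mentioning a keyword."""

    NAME = "example_keyword"
    DISPLAY_NAME = "Example keyword analyzer"
    DESCRIPTION = "Illustrative example analyzer."

    def run(self):
        """Entry point for the analyzer."""
        events = self.event_stream(
            query_string='message:"suspicious"',
            return_fields=["message"],
        )
        tagged = 0
        for event in events:
            # add_tags()/commit() are assumed helpers on the Event object.
            event.add_tags(["example-suspicious"])
            event.commit()
            tagged += 1
        return "Example keyword analyzer tagged {0:d} events.".format(tagged)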