def test_get_events_from_data_frame(self):
    """Test getting all events from data frame."""
    rows = [
        {'_id': event_id, '_type': 'manual', '_index': 'asdfasdf',
         'tool': tool_name}
        for event_id, tool_name in (
            ('123', 'isskeid'), ('124', 'tong'), ('125', 'klemma'))
    ]
    frame = pd.DataFrame(rows)

    events = list(utils.get_events_from_data_frame(frame, None))
    # One event per data frame row, identified by its '_id' column.
    self.assertEqual(len(events), 3)
    self.assertEqual(
        set(event.event_id for event in events),
        set(['123', '124', '125']))
def run(self):
    """Entry point for the analyzer.

    Tags browser (WEBHIST) events that fall outside of the discovered
    active browsing hours, annotating each with an activity summary,
    an hour count attribute and a sleeping-face emoji.

    Returns:
        String with summary of the analyzer result
    """
    # TODO: Once we can identify user generated events this should be
    # updated to include all user generated events instead of focusing
    # solely on browser events.
    query = 'source_short:"WEBHIST" OR source:"WEBHIST"'

    return_fields = ['timestamp', 'url', 'tag', '__ts_emojis']
    data_frame = self.event_pandas(
        query_string=query, return_fields=return_fields)

    if not data_frame.shape[0]:
        return 'No browser events discovered.'

    sleeping_emoji = emojis.get_emoji('SLEEPING_FACE')

    # Cast timestamps to numeric BEFORE comparing them: if the datastore
    # returns them as strings the range comparisons below would compare
    # lexicographically (or raise) instead of numerically.
    data_frame['timestamp'] = pd.to_numeric(data_frame.timestamp)

    # This filters out all events that have a zero timestamp as well as
    # those that occur after 2038-01-01 (2145916800000000 us since epoch);
    # this may need to be changed in the future.
    data_frame = data_frame[
        (data_frame.timestamp > 0) & (
            data_frame.timestamp < 2145916800000000)]

    # Timestamps are microseconds since epoch; derive a UTC datetime and
    # the hour-of-day used for the activity aggregation.
    data_frame['datetime'] = pd.to_datetime(
        data_frame.timestamp / 1e6, utc=True, unit='s')
    data_frame['hour'] = pd.to_numeric(
        data_frame.datetime.dt.strftime('%H'))

    total_count = data_frame.shape[0]
    activity_hours, threshold, aggregation = get_active_hours(data_frame)

    if not activity_hours:
        return 'Did not discover any activity hours.'

    # Map hour-of-day -> event count, used for the per-event summary.
    hour_count = dict(aggregation.values.tolist())
    data_frame_outside = data_frame[~data_frame.hour.isin(activity_hours)]

    for event in utils.get_events_from_data_frame(
            data_frame_outside, self.datastore):
        event.add_tags(['outside-active-hours'])
        hour = event.source.get('hour')
        this_hour_count = hour_count.get(hour)
        event.add_attributes(
            {'activity_summary': (
                'Number of events for this hour ({0:d}): {1:d}, with the '
                'threshold value: {2:0.2f}').format(
                    hour, this_hour_count, threshold),
             'hour_count': this_hour_count})
        event.add_emojis([sleeping_emoji])
        event.commit()

    return (
        'Tagged {0:d} out of {1:d} events as outside of normal '
        'active hours.').format(data_frame_outside.shape[0], total_count)
def test_get_events_from_data_frame(self):
    """Test getting all events from data frame."""
    tools = {"123": "isskeid", "124": "tong", "125": "klemma"}
    lines = [
        {"_id": event_id, "_type": "manual", "_index": "asdfasdf",
         "tool": tool_name}
        for event_id, tool_name in tools.items()
    ]
    frame = pd.DataFrame(lines)

    events = list(utils.get_events_from_data_frame(frame, None))
    # Expect exactly one event per row, keyed by the '_id' column.
    self.assertEqual(len(events), 3)
    ids = {event.event_id for event in events}
    self.assertEqual(ids, {"123", "124", "125"})
def test_get_events_from_data_frame(self):
    """Test getting all events from data frame."""
    def _make_row(event_id, tool):
        # All rows share the same type and index; only id/tool vary.
        return {'_id': event_id, '_type': 'manual',
                '_index': 'asdfasdf', 'tool': tool}

    frame = pd.DataFrame([
        _make_row('123', 'isskeid'),
        _make_row('124', 'tong'),
        _make_row('125', 'klemma'),
    ])

    events = list(utils.get_events_from_data_frame(frame, None))
    self.assertEqual(len(events), 3)
    self.assertEqual(
        {event.event_id for event in events}, {'123', '124', '125'})
def run(self):
    """Entry point for the analyzer.

    Tags browser (WEBHIST) events occurring outside of the discovered
    active browsing hours, and — when any events were tagged — adds a
    story plus an aggregation group (activity bar chart layered with a
    threshold line chart) to the sketch.

    Returns:
        String with summary of the analyzer result
    """
    # TODO: Once we can identify user generated events this should be
    # updated to include all user generated events instead of focusing
    # solely on browser events.
    query = 'source_short:"WEBHIST" OR source:"WEBHIST"'

    return_fields = ['datetime', 'timestamp', 'url', 'tag', '__ts_emojis']
    data_frame = self.event_pandas(
        query_string=query, return_fields=return_fields)

    if not data_frame.shape[0]:
        return 'No browser events discovered.'

    sleeping_emoji = emojis.get_emoji('SLEEPING_FACE')

    # This query filters out all timestamps that have a zero timestamp as
    # well as those that occur after 2038-01-01, this may need to be
    # changed in the future.
    # Cast to numeric first so the range comparisons below are numeric
    # even if the datastore returned timestamps as strings.
    data_frame['timestamp'] = pd.to_numeric(data_frame.timestamp)
    data_frame = data_frame[
        (data_frame.timestamp > 0) & (
            data_frame.timestamp < 2145916800000000)]

    # Timestamps are microseconds since epoch; derive a UTC datetime and
    # the hour-of-day used by the activity aggregation.
    data_frame['datetime'] = pd.to_datetime(
        data_frame.timestamp / 1e6, utc=True, unit='s')
    data_frame['hour'] = pd.to_numeric(
        data_frame.datetime.dt.strftime('%H'))

    total_count = data_frame.shape[0]
    activity_hours, threshold, aggregation = get_active_hours(data_frame)

    if not activity_hours:
        return 'Did not discover any activity hours.'

    # Map of hour-of-day -> event count, used in the per-event summary.
    hour_count = dict(aggregation.values.tolist())
    data_frame_outside = data_frame[~data_frame.hour.isin(activity_hours)]

    # Tag and annotate every event that falls outside the active hours.
    for event in utils.get_events_from_data_frame(
            data_frame_outside, self.datastore):
        event.add_tags(['outside-active-hours'])
        hour = event.source.get('hour')
        this_hour_count = hour_count.get(hour)
        event.add_attributes({
            'activity_summary': (
                'Number of events for this hour ({0:d}): {1:d}, with the '
                'threshold value: {2:0.2f}').format(
                    hour, this_hour_count, threshold),
            'hour_count': this_hour_count
        })
        event.add_emojis([sleeping_emoji])
        event.commit()

    tagged_events, _ = data_frame_outside.shape
    if tagged_events:
        story = self.sketch.add_story('{0:s} - {1:s}'.format(
            utils.BROWSER_STORY_TITLE, self.timeline_name))
        story.add_text(utils.BROWSER_STORY_HEADER, skip_if_exists=True)

        # Find some statistics about the run time of the analyzer.
        percent = (tagged_events / total_count) * 100.0

        # activity_hours may wrap around midnight; scan for the first gap
        # in the sequence of consecutive hours to find where the active
        # block actually starts (`end` is the hour after the gap, 0 if
        # the hours are fully consecutive).
        last_hour = activity_hours[0]
        end = 0
        for hour in activity_hours[1:]:
            if hour != last_hour + 1:
                end = hour
                break
            last_hour = hour

        if not end:
            first = activity_hours[0]
            last = activity_hours[-1]
        else:
            first = end
            index = activity_hours.index(end)
            last = activity_hours[index - 1]

        story.add_text(
            '## Browser Timeframe Analyzer\n\nThe browser timeframe '
            'analyzer discovered {0:d} browser events that occurred '
            'outside of the typical browsing window of this browser '
            'history ({1:s}), or around {2:0.2f}% of the {3:d} total '
            'events.\n\nThe analyzer determines the activity hours by '
            'finding the frequency of browsing events per hour, and then '
            'discovering the longest block of most active hours before '
            'proceeding with flagging all events outside of that time '
            'period. This information can be used by other analyzers '
            'or by manually looking for other activity within the '
            'inactive time period to find unusual actions.\n\n'
            'The hours considered to be active hours are the hours '
            'between {4:02d} and {5:02d} (hours in UTC) and the '
            'threshold used to determine if an hour was considered to be '
            'active was: {6:0.2f}.'.format(
                tagged_events, self.timeline_name, percent, total_count,
                first, last, threshold))

        # Layered group: activity bar chart with the threshold drawn as a
        # red line chart on top.
        group = self.sketch.add_aggregation_group(
            name='Browser Activity Per Hour',
            description='Created by the browser timeframe analyzer')
        group.set_layered()

        params = {
            'data': aggregation.to_dict(orient='records'),
            'title': 'Browser Activity Per Hour ({0:s})'.format(
                self.timeline_name),
            'field': 'hour',
            'order_field': 'hour',
        }
        agg_obj = self.sketch.add_aggregation(
            name='Browser Activity Per Hour ({0:s})'.format(
                self.timeline_name),
            agg_name='manual_feed', agg_params=params,
            chart_type='barchart',
            description='Created by the browser timeframe analyzer',
            label='informational')
        group.add_aggregation(agg_obj)

        # Flat threshold line at the computed value across all 24 hours.
        lines = [{'hour': x, 'count': threshold} for x in range(0, 24)]
        params = {
            'data': lines,
            'title': 'Browser Timeframe Threshold ({0:s})'.format(
                self.timeline_name),
            'field': 'hour',
            'order_field': 'hour',
            'chart_color': 'red',
        }
        # NOTE(review): this aggregation reuses the same `name` as the
        # bar chart above while its title says "Threshold" — presumably
        # intentional for the layered group, but verify it does not
        # overwrite the first aggregation.
        agg_line = self.sketch.add_aggregation(
            name='Browser Activity Per Hour ({0:s})'.format(
                self.timeline_name),
            agg_name='manual_feed', agg_params=params,
            chart_type='linechart',
            description='Created by the browser timeframe analyzer',
            label='informational')
        group.add_aggregation(agg_line)
        story.add_aggregation_group(group)

    return ('Tagged {0:d} out of {1:d} events as outside of normal '
            'active hours.').format(tagged_events, total_count)