def collect(self, jobs=None):
    dfs = []

    # Removing the temporary split directory if it exists
    output_dir = getattr(list(jobs.values())[0].criteria,
                         'output_dir', None)
    if output_dir and os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    for jid, job in jobs.items():
        if job.status == Job.ERROR:
            raise AnalysisException(
                "%s for pcap file %s failed: %s" %
                (job, job.criteria.pcapfilename, job.message))
        subdf = job.data()
        if subdf is None:
            continue
        dfs.append(subdf)

    if not dfs:
        logger.debug("%s: no data is collected" % self.__class__.__name__)
        return QueryComplete(None)

    df = pandas.concat(dfs, ignore_index=True)
    logger.debug("%s: Query ended." % self.__class__.__name__)
    return QueryComplete(df)
def collect(self, jobs=None):
    dfs = []

    for jid, job in jobs.iteritems():
        if job.status == Job.ERROR:
            raise AnalysisException(
                "Job for host '{}' failed: {}".format(
                    job.criteria.dev.name, job.message))
        subdf = job.data()
        if subdf is None:
            continue
        dfs.append(subdf)

    if not dfs:
        return QueryComplete(None)

    df = pandas.concat(dfs, ignore_index=True)

    def break_line(s):
        return s.replace('\n', '<br>')

    df['output'] = df['output'].apply(break_line)
    return QueryComplete(df)
def collect(self, jobs=None):
    dfs_from_jobs, dfs_from_db = [], []

    objs = ExistingIntervals.objects.filter(table_handle=self.handle)
    if objs:
        obj = objs[0]
        # Query for the intervals already in the db
        itvs_in_db = self.query_interval.intersection(obj.intervals)
        dfs_from_db = [self.query(itv.start, itv.end)
                       for itv in itvs_in_db]
    else:
        obj = ExistingIntervals(namespace=self.ds_table.namespace,
                                sourcefile=self.ds_table.sourcefile,
                                table=self.ds_table.name,
                                criteria=self.no_time_criteria,
                                table_handle=self.handle,
                                intervals=IntervalList([]))

    for job_id, job in jobs.iteritems():
        df = job.data()
        if df is None:
            continue
        dfs_from_jobs.append(df)
        obj.intervals += \
            TimeInterval(df[self.time_col].min().to_datetime(),
                         df[self.time_col].max().to_datetime())

    if not dfs_from_jobs:
        return QueryComplete(None)

    storage.write(index=make_index(self.ds_table.namespace),
                  doctype=self.handle,
                  data_frame=pandas.concat(dfs_from_jobs,
                                           ignore_index=True),
                  timecol=self.time_col)

    obj.intervals = self._converge_adjacent(obj.intervals)
    obj.tzinfo = self.job.criteria.starttime.tzinfo

    # Only update existing intervals if writing to the db succeeds
    obj.save()

    # Reading from the db immediately after writing can return
    # incorrect data, so stitch the data frames together instead
    total_df = pandas.concat(dfs_from_db + dfs_from_jobs,
                             ignore_index=True)
    return QueryComplete(total_df.sort(self.time_col).drop_duplicates())
def run(self):
    # For each widget, get all the data
    profiler = DeviceManager.get_device(self.table.options.netprofiler_id)

    lr = LiveReport(profiler, template_id=self.table.options.template_id)

    # Figure out columns by querying the widget
    # cols = lr.get_columns(self.table.options.widget_id)

    # Find the query object
    query_idx = lr.get_query_names().index(self.table.options.query_id)

    # Refresh the columns of the table
    self._refresh_columns(profiler, report=lr, query=lr.queries[query_idx])

    data = lr.get_data(index=query_idx)

    col_names = [col.label if col.ephemeral else col.key
                 for col in lr.queries[query_idx].columns]

    df = pd.DataFrame(columns=col_names, data=data)

    return QueryComplete(df)
def run(self):
    criteria = self.job.criteria

    netshark = DeviceManager.get_device(criteria.netshark_device)

    self.export_name = str(path_to_class(netshark,
                                         criteria.netshark_source_name))

    source = netshark.get_capture_job_by_name(self.export_name)

    timefilter = TimeFilter(criteria.starttime, criteria.endtime)

    handle = Job._compute_handle(self.table, criteria)

    # Check if the pcaps directory exists; if not, create it
    if not os.path.exists(PCAP_DIR):
        os.mkdir(PCAP_DIR)

    while self.all_pcap_size > settings.PCAP_SIZE_LIMIT:
        self.delete_oldest_pcap()

    self.filename = add_pcap_dir('%s.pcap' % handle)

    filters = ([BpfFilter(filt) for filt in self.table.options.filters]
               or None)

    with netshark.create_export(
            source, timefilter, filters=filters,
            wait_for_data=self.table.options.wait_for_data,
            wait_duration=self.table.options.wait_duration) as e:
        self.download(e)

    return QueryComplete(pandas.DataFrame([dict(filename=self.filename)]))
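# `all_pcap_size` and `delete_oldest_pcap` are not shown in this section;
# a minimal sketch, assuming the quota is enforced by totalling the files
# under PCAP_DIR and evicting the least recently modified one:
@property
def all_pcap_size(self):
    """Total size in bytes of all files currently under PCAP_DIR."""
    return sum(os.path.getsize(os.path.join(PCAP_DIR, f))
               for f in os.listdir(PCAP_DIR))

def delete_oldest_pcap(self):
    """Remove the file under PCAP_DIR with the oldest mtime."""
    paths = [os.path.join(PCAP_DIR, f) for f in os.listdir(PCAP_DIR)]
    if paths:
        os.remove(min(paths, key=os.path.getmtime))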
def run(self): """ Main execution method """ args = self._prepare_report_args() with lock: report = SingleQueryReport(args.profiler) report.run( realm=self.table.options.realm, groupby=args.profiler.groupbys[self.table.options.groupby], centricity=args.centricity, columns=args.columns, timefilter=args.timefilter, trafficexpr=args.trafficexpr, data_filter=args.datafilter, resolution=args.resolution, sort_col=args.sortcol, sync=False, limit=args.limit ) data = self._wait_for_data(report) if self.table.rows > 0: data = data[:self.table.rows] logger.info("Report %s returned %s rows" % (self.job, len(data))) return QueryComplete(data)
def analyze(self, jobs): """ Pivot data results from jobs """ job = jobs.values()[0] rs = self.table.options.resample_interval try: rs = '{0}s'.format(int(job.criteria.resample_interval)) except ValueError: logger.warning("{0}: resample_interval ({2}) not set or valid in " "job criteria {1}".format(self, job.criteria, rs)) job.criteria.resample_interval = u'{0}'.format(rs.split('s')[0]) df = job.data() rs_df = resample(df, self.table.options.resample_column, rs, self.table.options.resample_operation) curcols = [c.name for c in self.job.get_columns(synthetic=False)] jcols = [c.name for c in job.get_columns(synthetic=False)] for c in jcols: if c not in curcols: # Default data type is float. Column.create(self.job.table, name=c, label=c, ephemeral=self.job) return QueryComplete(rs_df)
def analyze(self, jobs): """ Pivot data results from jobs """ df = jobs.values()[0].data() if (self.table.options.pivot_column is None or self.table.options.pivot_value is None): msg = ('Both "pivot_column" and "pivot_value" options need ' 'to be specified for PivotTables.') logger.error(msg) return QueryError(msg) pivot = df.pivot(index=self.table.options.pivot_index, columns=self.table.options.pivot_column, values=self.table.options.pivot_value).reset_index() # since numeric values may now be columns, change them to strings # for proper pattern matching downstream pivot.rename(columns=lambda x: str(x), inplace=True) col_names = [x for x in pivot.columns] cur_cols = [c.name for c in self.job.get_columns(synthetic=False)] for c in col_names: if c not in cur_cols: label = self.table.options.pivot_column_prefix + c Column.create(self.job.table, name=c, label=label, ephemeral=self.job, datatype=self.table.options.pivot_datatype) return QueryComplete(pivot)
def post_run(self):
    columns = self.table.column_set.order_by('id')
    timecol = columns[0].name
    datacol = columns[1].name

    if self.data is not None:
        dft = self.data.set_index(timecol)[datacol]

        # Add null values to the beginning and end of the time series
        # to make sure the resample interval lines up
        start = self.starttime.astimezone(pytz.UTC)
        end = self.endtime.astimezone(pytz.UTC)
        dft[start] = numpy.nan
        dft[end] = numpy.nan

        # Adjust the resample size depending on the overall time interval
        delta = end - start
        if delta <= datetime.timedelta(minutes=1):
            resample = '1S'
        elif delta <= datetime.timedelta(minutes=15):
            resample = '60S'    # 1 minute
        else:
            resample = '5T'     # 5 minutes

        dft = dft.resample(resample, how='count')
        self.data = dft.reset_index().rename(columns={'index': timecol})

    return QueryComplete(self.data)
def run(self):
    criteria = self.job.criteria

    values = [[str(k), str(v)] for k, v in criteria.iteritems()]
    values.append(['criteria.starttime', str(criteria.starttime)])

    df = pandas.DataFrame(values, columns=['key', 'value']).sort('key')
    return QueryComplete(df)
def analyze(self, jobs):
    df = jobs['base'].data()

    criteria = self.job.criteria

    devid = criteria.appresponse_device
    duration = criteria.duration.seconds
    endtime = datetime_to_seconds(criteria.endtime)
    granularity = criteria.granularity.seconds

    def make_report_link(mod, v):
        s = ('<a href="/report/appresponse/{}/?'
             'duration={}&appresponse_device={}&endtime={}&'
             'pivot_column_names={}&granularity={}&auto_run=true" '
             'target="_blank">{}</a>'
             .format(mod, duration, devid, endtime, v, granularity, v))
        return s

    make_report_link_with_mod = functools.partial(
        make_report_link, self.table.options.ts_report_mod_name)

    pivot_col = self.table.options.pivot_column_name
    df[pivot_col] = df[pivot_col].map(make_report_link_with_mod)

    return QueryComplete(df)
def run(self):
    # Collect all dependent tables
    options = self.table.options

    model = get_schema_map()[options.schema]
    df = model.objects.get_dataframe()

    if df.empty:
        return QueryError(
            'No metrics defined for schema "%s". Add new metrics '
            'using the <a href="%s">admin interface</a>.' %
            (options.schema,
             reverse('admin:metrics_plugin_%s_changelist' %
                     model.__name__.lower()))
        )

    # Add some default columns as needed.  New ones are created as
    # normal columns rather than ephemeral - the table schema will not
    # be dynamic, and any changes will be done via code changes and/or
    # a report reload.  We check whether some have already been defined
    # to allow for customization of the actual labels or column
    # display keys.
    keys = list(df.keys())
    for k in keys:
        try:
            Column.objects.get(table=self.job.table, name=k)
        except ObjectDoesNotExist:
            Column.create(self.job.table, k, k.title(), datatype='string')

    logger.debug("%s: completed successfully" % self)
    return QueryComplete(df)
def analyze(self, jobs=None):
    filtered_list = ExistingIntervals.objects.filter(
        table_handle=self.handle)

    existing_intervals = None
    if filtered_list:
        existing_intervals = filtered_list[0].intervals
        if self.query_interval in existing_intervals:
            # Search DB for the queried data
            return QueryComplete(self.query(self.query_interval.start,
                                            self.query_interval.end))

    intervals_to_call = self._check_intervals(self.query_interval -
                                              existing_intervals)

    dep_jobs = {}
    for interval in intervals_to_call:
        criteria = copy.copy(self.job.criteria)
        # Use the two time-related fields
        criteria.starttime = interval.start
        criteria.endtime = interval.end
        job = Job.create(table=self.ds_table, criteria=criteria,
                         update_progress=False, parent=self.job)
        dep_jobs[job.id] = job

    return QueryContinue(self.collect, jobs=dep_jobs)
def post_run(self):
    columns = self.table.column_set.order_by('id')
    groupby = columns[0].name

    if self.data is not None:
        dfg = self.data.groupby(groupby).count()
        self.data = dfg.reset_index()

    return QueryComplete(self.data)
def analyze(self, jobs):
    # Based on the input pivot column names (e.g. CIFS, RTP, Facebook),
    # take the dataframe keyed by application ID and start time and
    # derive a dataframe keyed by start time, with one column per input
    # pivot value
    df = jobs['base'].data()

    # First clear all the dynamic columns that were associated with
    # the table the last time the report was run, but do not delete
    # the time column
    for col in self.table.get_columns():
        if col.name == 'time':
            continue
        col.delete()

    base_table = Table.from_ref(self.table.options.tables.base)
    time_col_name = None
    for col in base_table.get_columns():
        if col.datatype == Column.DATATYPE_TIME and col.iskey:
            time_col_name = col.name
            break

    if not time_col_name:
        raise AppResponseException("No key 'time' column defined "
                                   "in base table")

    pivot_column = self.table.options.pivot_column_name

    sub_dfs = []
    for pivot in self.job.criteria.pivot_column_names.split(','):
        # Add the pivot column to the table
        pivot = pivot.strip()
        AppResponseColumn.create(self.table, pivot, pivot)

        # Add the pivot column to the data frame
        sub_df = df[df[pivot_column] == pivot]

        # Extract the time column and the value column
        sub_df = sub_df[[time_col_name,
                         self.table.options.value_column_name]]

        # Rename columns to 'time' and the pivot column name
        sub_df.rename(columns={time_col_name: u'time',
                               self.table.options.value_column_name: pivot},
                      inplace=True)

        sub_dfs.append(sub_df)

    df_final = reduce(lambda df1, df2:
                      pandas.merge(df1, df2, on=u'time', how='outer'),
                      sub_dfs)

    return QueryComplete(df_final)
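# Illustrative example (hypothetical data) of the final reduce/merge
# step above: each per-pivot frame contributes one column, outer-joined
# on 'time' so values missing for a given timestamp become NaN.
import pandas
from functools import reduce

cifs = pandas.DataFrame({'time': [1, 2], 'CIFS': [10, 20]})
rtp = pandas.DataFrame({'time': [2, 3], 'RTP': [5, 15]})
merged = reduce(lambda df1, df2:
                pandas.merge(df1, df2, on='time', how='outer'),
                [cifs, rtp])
# time 1 -> CIFS 10, RTP NaN; time 2 -> 20, 5; time 3 -> NaN, 15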
def run(self):
    sh = DeviceManager.get_device(self.job.criteria.steelhead_device)

    flows = Model.get(sh, feature='flows')
    res = flows.show_flows('all')

    for k, v in res['flows_summary'].iteritems():
        v['category'] = k

    return QueryComplete(res['flows_summary'].values())
def run(self):
    sh_db = self.job.criteria.dev
    cmd = self.job.criteria.command

    sh = DeviceManager.get_device(sh_db.id)
    output = sh.cli.exec_command(cmd, mode=CLIMode.ENABLE)

    return QueryComplete([dict(dev_name=sh_db.name, output=output)])
def post_run(self):
    df = self.data
    if df is not None:
        if 'eventid' in df:
            make_link = functools.partial(self.make_link, 'event-lookup')
            df['eventid'] = df['eventid'].map(make_link)
        elif 'id' in df:
            make_link = functools.partial(self.make_link, 'alert-detail')
            df['id'] = df['id'].map(make_link)
        self.data = df

    return QueryComplete(self.data)
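# `make_link` is not shown in this section; a minimal sketch, assuming it
# wraps the raw id in an HTML anchor pointing at the named detail page
# (the URL pattern below is hypothetical):
def make_link(self, page, value):
    return ('<a href="/{0}/{1}/" target="_blank">{1}</a>'
            .format(page, value))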
def run(self):
    criteria = self.job.criteria

    if criteria.scc_device == '':
        logger.debug('%s: No scc device selected' % self.table)
        self.job.mark_error("No SCC Device Selected")
        return False

    columns = [col.name for col in self.table.get_columns(synthetic=False)]

    scc = DeviceManager.get_device(criteria.scc_device)

    # Obtain the report class definition
    report_cls = get_scc_report_class(self.service, self.resource)

    # Instantiate a report object
    report_obj = report_cls(scc)

    # Build criteria kwargs
    kwargs = {}
    for name in set(report_obj.required_fields +
                    report_obj.non_required_fields):
        # Criteria has attrs starttime and endtime, which map to the
        # start_time and end_time fields referenced in an SCC service
        if name in ['start_time', 'end_time']:
            name_in_criteria = name.replace('_', '')
        else:
            name_in_criteria = name
        if hasattr(criteria, name_in_criteria):
            kwargs[name] = getattr(criteria, name_in_criteria)

    report_obj.run(**kwargs)

    df = self.extract_dataframe(report_obj.data)

    if df is not None:
        for col in columns:
            if col not in df:
                raise KeyError("Table %s has no column '%s'" %
                               (self.job.table.name, col))

        df = df.ix[:, columns]
        self.data = df
        logger.info("SCC job %s returning %d rows of data" %
                    (self.job, len(self.data)))
    else:
        self.data = None

    return QueryComplete(self.data)
def collect(self, jobs=None):
    out = []
    for jid, job in jobs.iteritems():
        sharkdata = job.data()
        if sharkdata is not None:
            s = Device.objects.get(id=job.criteria.netshark_device)
            # Skip the 5-character prefix of the source name
            # (presumably 'jobs/')
            out.append([s.name,
                        s.host,
                        job.criteria.netshark_source_name[5:],
                        sharkdata['generic_bytes']])

    columns = ['name', 'host', 'capjob', 'bytes']
    df = pandas.DataFrame(out, columns=columns)
    return QueryComplete(df)
def run(self):
    sh = DeviceManager.get_device(self.job.criteria.steelhead_device)

    stats = Model.get(sh, feature='stats')

    duration = self.job.criteria.duration
    directions = ['lan-to-wan', 'wan-to-lan', 'bi-directional']

    total = []
    for d in directions:
        res = stats.show_stats_bandwidth('all', d, duration)
        res['direction'] = d
        total.append(res)

    return QueryComplete(total)
def run(self):
    criteria = self.job.criteria

    profiler = DeviceManager.get_device(criteria.netprofiler_device)

    widget_config = profiler.api.templates.get_config(criteria.template_id)

    recs = []
    for w in widget_config:
        dict0 = {'template_id': str(criteria.template_id)}
        dict1 = dict((k, w[k]) for k in ['widget_id', 'title'])
        dict2 = dict((k, w['config'][k])
                     for k in ['widget_type', 'visualization',
                               'datasource'])
        recs.append(dict((k, v)
                         for d in [dict0, dict1, dict2]
                         for k, v in d.iteritems()))

    return QueryComplete(pd.DataFrame(recs))
def run(self):
    obj_class = self.table.options.obj_class
    feature = self.table.options.feature
    method = self.table.options.method
    args = self.table.options.args

    sh = DeviceManager.get_device(self.job.criteria.steelhead_device)

    obj = obj_class.get(sh, feature=feature)
    res = getattr(obj, method)(*args)

    if not isinstance(res, list):
        res = [res]

    for e in res:
        for k, v in e.iteritems():
            e[k] = str(v)

    return QueryComplete(res)
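# Hypothetical table options for the generic runner above, reproducing
# the explicit 'flows' query shown earlier in this section (the create
# call and its signature are assumptions, not verified framework API):
table = SteelHeadTable.create(name='flows-all',
                              obj_class=Model,
                              feature='flows',
                              method='show_flows',
                              args=['all'])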
def analyze(self, jobs=None):
    logger.debug('TimeSeriesTable analysis with jobs %s' % jobs)

    filtered_list = ExistingIntervals.objects.filter(
        table_handle=self.handle, criteria=self.no_time_criteria)

    existing_intervals = None
    if filtered_list:
        existing_intervals = filtered_list[0].intervals
        logger.debug('Found existing intervals for handle %s: %s' %
                     (self.handle, existing_intervals))

        if self.query_interval in existing_intervals:
            logger.debug('Query interval totally covered by DB, '
                         'returning DB query.')
            # Search DB for the queried data
            data = self.query(self.query_interval.start,
                              self.query_interval.end)
            return QueryComplete(data)

        logger.debug('Query interval only partially covered by DB ...')

    intervals_to_call = self._check_intervals(self.query_interval -
                                              existing_intervals)

    logger.debug('Setting up %d jobs to cover missing data '
                 'for these intervals: %s' %
                 (len(intervals_to_call), intervals_to_call))

    dep_jobs = {}
    for interval in intervals_to_call:
        criteria = copy.copy(self.job.criteria)
        # Use the two time-related fields
        criteria.starttime = interval.start
        criteria.endtime = interval.end
        job = Job.create(table=self.ds_table, criteria=criteria,
                         update_progress=False, parent=self.job)
        dep_jobs[job.id] = job

    return QueryContinue(self.collect, jobs=dep_jobs)
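# `_check_intervals` is not defined in this section; a plausible sketch
# (an assumption - the real policy may differ), splitting each missing
# interval into bounded chunks so no single dependent job queries an
# unbounded time range (`datetime` and `TimeInterval` as used elsewhere
# in this section):
def _check_intervals(self, intervals,
                     max_span=datetime.timedelta(days=1)):
    checked = []
    for itv in intervals:
        start = itv.start
        while start < itv.end:
            end = min(start + max_span, itv.end)
            checked.append(TimeInterval(start, end))
            start = end
    return checked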
def run(self):
    sks = Device.objects.filter(enabled=True, module='netshark')

    res = []
    for sk in sks:
        sk_dev = DeviceManager.get_device(sk.id)
        for job in sk_dev.get_capture_jobs():
            if_name = job.data['config']['interface_name']
            start = str(nsec_string_to_datetime(job.packet_start_time))
            end = str(nsec_string_to_datetime(job.packet_end_time))

            bpf_filter = job.data['config'].get('bpf_filter', '')
            if len(bpf_filter) > self.MAX_LENGTH:
                bpf_filter = bpf_filter[:self.MAX_LENGTH - 2] + '...'

            pkts_dropped = job.get_stats()['packets_dropped']
            pkts_written = job.get_stats()['packets_written']

            job_data = dict(
                netshark=sk.name,
                job_id=job.data['id'],
                job_name=job.data['config']['name'],
                interface=if_name,
                state=job.data['status']['state'],
                size=job.data['status']['packet_size'],
                start_time=start,
                end_time=end,
                bpf_filter=bpf_filter,
                dpi_enabled=str(job.dpi_enabled),
                index_enabled=str(job.index_enabled),
                last_sec_dropped=pkts_dropped['last_second'],
                last_min_dropped=pkts_dropped['last_minute'],
                last_hr_dropped=pkts_dropped['last_hour'],
                last_sec_written=pkts_written['last_second'],
                last_min_written=pkts_written['last_minute'],
                last_hr_written=pkts_written['last_hour'])
            res.append(job_data)

    return QueryComplete(pandas.DataFrame(res))
def post_run(self): """Execute any Functions saved to Table. In most cases, this function will be simply overridden by a subclass which will implement its own detailed processing. This method provides a shortcut to support passing a Function directly to the create method. """ options = self.table.options if options.function is None: return QueryError("Table %s has no analysis function defined" % self.table) try: df = options.function(self, options.tables, self.job.criteria) except Exception as e: return QueryError("Analysis function %s failed" % options.function, e) logger.debug("%s: completed successfully" % self) return QueryComplete(df)
def collect(self, jobs=None):
    out = []
    for jid, job in jobs.iteritems():
        ardata = job.data()
        if ardata is not None:
            total_bytes = ardata['total_bytes'].sum()
            if total_bytes:
                s = Device.objects.get(id=job.criteria.appresponse_device)
                out.append([s.name,
                            s.host,
                            job.criteria.appresponse_source,
                            total_bytes])

    if not out:
        out.append(['No capture jobs found', '--', '--', ''])

    columns = ['name', 'host', 'capture_job', 'bytes']
    df = pandas.DataFrame(out, columns=columns)
    return QueryComplete(df)
def run(self): """ Main execution method. """ args = self._prepare_report_args() with lock: report = MultiQueryReport(args.profiler) report.run(template_id=self.table.options.template_id, timefilter=args.timefilter, trafficexpr=args.trafficexpr, resolution=args.resolution) data = self._wait_for_data(report) headers = report.get_legend() # create dataframe with all of the default headers df = pandas.DataFrame(data, columns=[h.key for h in headers]) # now filter down to the columns requested by the table columns = [col.name for col in self.table.get_columns(synthetic=False)] df = df[columns] logger.info("Report %s returned %s rows" % (self.job, len(df))) return QueryComplete(df)
def collect(self, jobs=None):
    logger.debug('TimeSeriesTable collect with jobs %s' % jobs)

    dfs_from_jobs, dfs_from_db = [], []

    objs = ExistingIntervals.objects.filter(table_handle=self.handle,
                                            criteria=self.no_time_criteria)
    if objs:
        # We should only find one with our handle
        if len(objs) > 1:
            logger.warning('Multiple instances of ExistingIntervals found '
                           'for handle %s, taking first one.' % self.handle)
        obj = objs[0]

        # Query for the intervals already in the db
        itvs_in_db = self.query_interval.intersection(obj.intervals)
        dfs_from_db = [self.query(itv.start, itv.end)
                       for itv in itvs_in_db]
    else:
        logger.debug('Creating new ExistingIntervals object for '
                     'namespace: %s, sourcefile: %s, table: %s, handle: %s'
                     % (self.ds_table.namespace, self.ds_table.sourcefile,
                        self.ds_table.name, self.handle))
        obj = ExistingIntervals(namespace=self.ds_table.namespace,
                                sourcefile=self.ds_table.sourcefile,
                                table=self.ds_table.name,
                                criteria=self.no_time_criteria,
                                table_handle=self.handle,
                                intervals=IntervalList([]))

    job_intervals = obj.intervals

    for job_id, job in jobs.iteritems():
        df = job.data()
        if df is None:
            continue
        dfs_from_jobs.append(df)

        # Evaluate job time interval extents.
        #
        # Handle the case where the data returned is slightly smaller
        # than the base query.  This can happen because data doesn't
        # always align with the query edges, but as a result any future
        # query of the same requested time interval would return the
        # same results.
        #
        # If our new interval is entirely within the original query,
        # and the deltas at each of the edges are less than the
        # resolution of the table, then we got the best we could hope
        # for, and we should just add the interval as originally
        # requested.
        job_criteria_start = job.criteria.starttime
        job_criteria_end = job.criteria.endtime
        job_data_start = df[self.time_col].min().to_datetime()
        job_data_end = df[self.time_col].max().to_datetime()

        if abs(job_data_start - job_criteria_start) < self.resolution:
            interval_start = job_criteria_start
        else:
            interval_start = job_data_start

        if abs(job_criteria_end - job_data_end) < self.resolution:
            interval_end = job_criteria_end
        else:
            interval_end = job_data_end

        logger.debug('Job time intervals: Criteria - (%s, %s), '
                     'Data - (%s, %s)' %
                     (job_criteria_start, job_criteria_end,
                      job_data_start, job_data_end))

        interval = TimeInterval(interval_start, interval_end)
        logger.debug('Appending TimeInterval: %s' % interval)
        job_intervals += interval

    if not dfs_from_jobs:
        return QueryComplete(None)

    if self.table.options.override_index:
        index = self.table.options.override_index
    else:
        index = make_index(self.ds_table.namespace)

    storage.write(index=index,
                  doctype=self.handle,
                  data_frame=pandas.concat(dfs_from_jobs,
                                           ignore_index=True),
                  timecol=self.time_col,
                  id_method=self.table.options.id_method)

    obj.intervals = self._converge_adjacent(job_intervals)
    obj.tzinfo = self.job.criteria.starttime.tzinfo

    # Only update existing intervals if writing to the db succeeds
    obj.save()

    # Reading from the db immediately after writing can return
    # incorrect data, so stitch the data frames together instead
    total_df = pandas.concat(dfs_from_db + dfs_from_jobs,
                             ignore_index=True)
    data = total_df.sort(self.time_col).drop_duplicates()

    return QueryComplete(data)
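# `_converge_adjacent` is not defined in this section; a minimal sketch,
# assuming it merges intervals separated by no more than one table
# resolution into a single interval (TimeInterval/IntervalList semantics
# are assumed from their usage above):
def _converge_adjacent(self, intervals):
    merged = []
    for itv in sorted(intervals, key=lambda i: i.start):
        if merged and itv.start - merged[-1].end <= self.resolution:
            # Close enough to the previous interval: extend it
            merged[-1] = TimeInterval(merged[-1].start,
                                      max(merged[-1].end, itv.end))
        else:
            merged.append(itv)
    return IntervalList(merged)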
def collect(self, jobs):
    df = jobs['ts'].data()
    return QueryComplete(df)