def collect(self, jobs=None):
        dfs = []

        # Removing the temporary split directory if it exists
        output_dir = getattr(
            list(jobs.values())[0].criteria, 'output_dir', None)
        if output_dir and os.path.exists(output_dir):
            shutil.rmtree(output_dir)

        for jid, job in jobs.items():
            if job.status == Job.ERROR:
                raise AnalysisException(
                    "%s for pcap file %s failed: %s" %
                    (job, job.criteria.pcapfilename, job.message))
            subdf = job.data()
            if subdf is None:
                continue
            dfs.append(subdf)

        if not dfs:
            logger.debug("%s: no data is collected" % self.__class__.__name__)
            return QueryComplete(None)

        df = pandas.concat(dfs, ignore_index=True)

        logger.debug("%s: Query ended." % self.__class__.__name__)

        return QueryComplete(df)
Example 2
    def collect(self, jobs=None):
        dfs = []

        for jid, job in jobs.iteritems():
            if job.status == Job.ERROR:
                raise AnalysisException(
                    "Job for host '{}' failed: {}".format(
                        job.criteria.dev.name, job.message))

            subdf = job.data()
            if subdf is None:
                continue
            dfs.append(subdf)

        if not dfs:
            return QueryComplete(None)

        df = pandas.concat(dfs, ignore_index=True)

        def break_line(s):
            return s.replace('\n', '<br>')

        df['output'] = df['output'].apply(break_line)

        return QueryComplete(df)
Example 3
    def collect(self, jobs=None):
        dfs_from_jobs, dfs_from_db = [], []

        objs = ExistingIntervals.objects.filter(table_handle=self.handle)

        if objs:
            obj = objs[0]

            # Query for the intervals already in db
            itvs_in_db = self.query_interval.intersection(obj.intervals)
            dfs_from_db = [
                self.query(itv.start, itv.end) for itv in itvs_in_db
            ]
        else:
            obj = ExistingIntervals(namespace=self.ds_table.namespace,
                                    sourcefile=self.ds_table.sourcefile,
                                    table=self.ds_table.name,
                                    criteria=self.no_time_criteria,
                                    table_handle=self.handle,
                                    intervals=IntervalList([]))

        for job_id, job in jobs.iteritems():
            df = job.data()
            if df is None:
                continue
            dfs_from_jobs.append(df)

            obj.intervals += \
                TimeInterval(df[self.time_col].min().to_datetime(),
                             df[self.time_col].max().to_datetime())

        if not dfs_from_jobs:
            return QueryComplete(None)

        storage.write(index=make_index(self.ds_table.namespace),
                      doctype=self.handle,
                      data_frame=pandas.concat(dfs_from_jobs,
                                               ignore_index=True),
                      timecol=self.time_col)

        obj.intervals = self._converge_adjacent(obj.intervals)

        obj.tzinfo = self.job.criteria.starttime.tzinfo

        # Only update existing intervals if writing to db succeeds
        obj.save()

        # Reading from the db immediately after writing can return incorrect
        # data, so stitch the data frames together instead of re-querying
        total_df = pandas.concat(dfs_from_db + dfs_from_jobs,
                                 ignore_index=True)
        return QueryComplete(total_df.sort(self.time_col).drop_duplicates())
Example 4
    def run(self):

        # For each widget, get all the data
        profiler = DeviceManager.get_device(self.table.options.netprofiler_id)

        lr = LiveReport(profiler, template_id=self.table.options.template_id)

        # Figure out columns by querying the widget
        # cols = lr.get_columns(self.table.options.widget_id)

        # Find the query object
        query_idx = lr.get_query_names().index(self.table.options.query_id)

        # refresh the columns of the table
        self._refresh_columns(profiler, report=lr, query=lr.queries[query_idx])

        data = lr.get_data(index=query_idx)

        col_names = [
            col.label if col.ephemeral else col.key
            for col in lr.queries[query_idx].columns
        ]

        df = pd.DataFrame(columns=col_names, data=data)

        return QueryComplete(df)
    def run(self):
        criteria = self.job.criteria

        netshark = DeviceManager.get_device(criteria.netshark_device)

        self.export_name = str(
            path_to_class(netshark, criteria.netshark_source_name))

        source = netshark.get_capture_job_by_name(self.export_name)

        timefilter = TimeFilter(criteria.starttime, criteria.endtime)

        handle = Job._compute_handle(self.table, criteria)

        # check if the pcaps directory exists; if not, create it
        if not os.path.exists(PCAP_DIR):
            os.mkdir(PCAP_DIR)

        while self.all_pcap_size > settings.PCAP_SIZE_LIMIT:
            self.delete_oldest_pcap()

        self.filename = add_pcap_dir('%s.pcap' % handle)

        filters = ([BpfFilter(filt) for filt in self.table.options.filters]
                   or None)
        with netshark.create_export(
                source,
                timefilter,
                filters=filters,
                wait_for_data=self.table.options.wait_for_data,
                wait_duration=self.table.options.wait_duration) as e:
            self.download(e)

        return QueryComplete(pandas.DataFrame([dict(filename=self.filename)]))
Example 6
    def run(self):
        """ Main execution method
        """
        args = self._prepare_report_args()

        with lock:
            report = SingleQueryReport(args.profiler)
            report.run(
                realm=self.table.options.realm,
                groupby=args.profiler.groupbys[self.table.options.groupby],
                centricity=args.centricity,
                columns=args.columns,
                timefilter=args.timefilter,
                trafficexpr=args.trafficexpr,
                data_filter=args.datafilter,
                resolution=args.resolution,
                sort_col=args.sortcol,
                sync=False,
                limit=args.limit
            )

        data = self._wait_for_data(report)

        if self.table.rows > 0:
            data = data[:self.table.rows]

        logger.info("Report %s returned %s rows" % (self.job, len(data)))
        return QueryComplete(data)
Example 7
    def analyze(self, jobs):
        """ Pivot data results from jobs """
        job = jobs.values()[0]

        rs = self.table.options.resample_interval
        try:
            rs = '{0}s'.format(int(job.criteria.resample_interval))
        except ValueError:
            logger.warning("{0}: resample_interval ({2}) not set or valid in "
                           "job criteria {1}".format(self, job.criteria, rs))

            job.criteria.resample_interval = u'{0}'.format(rs.split('s')[0])

        df = job.data()
        rs_df = resample(df, self.table.options.resample_column, rs,
                         self.table.options.resample_operation)

        curcols = [c.name for c in self.job.get_columns(synthetic=False)]
        jcols = [c.name for c in job.get_columns(synthetic=False)]
        for c in jcols:
            if c not in curcols:
                # Default data type is float.
                Column.create(self.job.table,
                              name=c,
                              label=c,
                              ephemeral=self.job)

        return QueryComplete(rs_df)
Example 8
    def analyze(self, jobs):
        """ Pivot data results from jobs """

        df = jobs.values()[0].data()

        if (self.table.options.pivot_column is None
                or self.table.options.pivot_value is None):
            msg = ('Both "pivot_column" and "pivot_value" options need '
                   'to be specified for PivotTables.')
            logger.error(msg)
            return QueryError(msg)

        pivot = df.pivot(index=self.table.options.pivot_index,
                         columns=self.table.options.pivot_column,
                         values=self.table.options.pivot_value).reset_index()

        # since numeric values may now be columns, change them to strings
        # for proper pattern matching downstream
        pivot.rename(columns=lambda x: str(x), inplace=True)

        col_names = [x for x in pivot.columns]
        cur_cols = [c.name for c in self.job.get_columns(synthetic=False)]

        for c in col_names:
            if c not in cur_cols:
                label = self.table.options.pivot_column_prefix + c
                Column.create(self.job.table,
                              name=c,
                              label=label,
                              ephemeral=self.job,
                              datatype=self.table.options.pivot_datatype)

        return QueryComplete(pivot)
Example 9
    def post_run(self):
        columns = self.table.column_set.order_by('id')
        timecol = columns[0].name
        datacol = columns[1].name

        if self.data is not None:
            dft = self.data.set_index(timecol)[datacol]
            # add null values to the beginning and end of the time series to
            # make sure the resample interval lines up
            start = self.starttime.astimezone(pytz.UTC)
            end = self.endtime.astimezone(pytz.UTC)
            dft[start] = numpy.nan
            dft[end] = numpy.nan

            # adjust the resample size depending on overall time interval
            delta = end - start
            if delta <= datetime.timedelta(minutes=1):
                resample = '1S'
            elif delta <= datetime.timedelta(minutes=15):
                resample = '60S'  # 1 minute
            else:
                resample = '5T'   # 5 minutes

            dft = dft.resample(resample, how='count')
            self.data = dft.reset_index().rename(columns={'index': timecol})
        return QueryComplete(self.data)
Example 10
    def run(self):
        criteria = self.job.criteria
        values = [[str(k), str(v)] for k, v in criteria.iteritems()]
        values.append(['criteria.starttime', str(criteria.starttime)])
        df = pandas.DataFrame(values, columns=['key', 'value']).sort('key')

        return QueryComplete(df)
Example 11
    def analyze(self, jobs):

        df = jobs['base'].data()

        criteria = self.job.criteria

        devid = criteria.appresponse_device
        duration = criteria.duration.seconds
        endtime = datetime_to_seconds(criteria.endtime)
        granularity = criteria.granularity.seconds

        def make_report_link(mod, v):
            s = ('<a href="/report/appresponse/{}/?'
                 'duration={}&appresponse_device={}&endtime={}&'
                 'pivot_column_names={}&granularity={}&auto_run=true" '
                 'target="_blank">{}</a>'.format(mod, duration, devid, endtime,
                                                 v, granularity, v))
            return s

        make_report_link_with_mod = functools.partial(
            make_report_link, self.table.options.ts_report_mod_name)

        pivot_col = self.table.options.pivot_column_name
        df[pivot_col] = df[pivot_col].map(make_report_link_with_mod)

        return QueryComplete(df)
Example 12
    def run(self):
        # Collect all dependent tables
        options = self.table.options

        model = get_schema_map()[options.schema]
        df = model.objects.get_dataframe()

        if df.empty:
            return QueryError(
                'No metrics defined for schema "%s".  Add new metrics '
                'using the <a href="%s">admin interface</a>.'
                % (options.schema,
                   reverse('admin:metrics_plugin_%s_changelist'
                           % model.__name__.lower()))
            )

        # Add some default columns as needed
        # New ones are created as normal columns rather than ephemeral ones:
        # the table schema will not be dynamic, so any changes must be made
        # via code changes and/or a report reload.

        # We check whether some columns have already been defined, to allow
        # customization of the actual labels or column display
        keys = list(df.keys())

        for k in keys:
            try:
                Column.objects.get(table=self.job.table, name=k)
            except ObjectDoesNotExist:
                Column.create(self.job.table, k, k.title(), datatype='string')

        logger.debug("%s: completed successfully" % self)
        return QueryComplete(df)
Example 13
    def analyze(self, jobs=None):

        filtered_list = ExistingIntervals.objects.filter(
            table_handle=self.handle)

        existing_intervals = None

        if filtered_list:
            existing_intervals = filtered_list[0].intervals

            if self.query_interval in existing_intervals:
                # Search DB for the queried data
                return QueryComplete(
                    self.query(self.query_interval.start,
                               self.query_interval.end))

        intervals_to_call = self._check_intervals(self.query_interval -
                                                  existing_intervals)

        dep_jobs = {}
        for interval in intervals_to_call:
            criteria = copy.copy(self.job.criteria)
            # Use the two time-related fields
            criteria.starttime = interval.start
            criteria.endtime = interval.end
            job = Job.create(table=self.ds_table,
                             criteria=criteria,
                             update_progress=False,
                             parent=self.job)
            dep_jobs[job.id] = job

        return QueryContinue(self.collect, jobs=dep_jobs)
Example 14
    def post_run(self):
        columns = self.table.column_set.order_by('id')
        groupby = columns[0].name

        if self.data is not None:
            dfg = self.data.groupby(groupby).count()
            self.data = dfg.reset_index()
        return QueryComplete(self.data)
Example 15
    def analyze(self, jobs):
        # Based on the input pivot column names (e.g. CIFS, RTP, Facebook),
        # take the dataframe keyed by application ID and start time and
        # derive a dataframe keyed by start_time, with one column per
        # input pivot value

        df = jobs['base'].data()
        # First clear all the dynamic columns that were associated with
        # the table the last time the report was run,
        # but do not delete the time column
        for col in self.table.get_columns():
            if col.name == 'time':
                continue
            col.delete()

        base_table = Table.from_ref(self.table.options.tables.base)

        time_col_name = None
        for col in base_table.get_columns():
            if col.datatype == Column.DATATYPE_TIME and col.iskey:
                time_col_name = col.name
                break

        if not time_col_name:
            raise AppResponseException("No key 'time' column defined "
                                       "in base table")

        pivot_column = self.table.options.pivot_column_name

        sub_dfs = []
        for pivot in self.job.criteria.pivot_column_names.split(','):
            # Add pivot column to the table
            pivot = pivot.strip()
            AppResponseColumn.create(self.table, pivot, pivot)

            # Add pivot column to the data frame
            sub_df = df[df[pivot_column] == pivot]

            # extract time column and value column
            sub_df = sub_df[[
                time_col_name, self.table.options.value_column_name
            ]]
            # Rename columns to 'time' and the pivot column name
            sub_df.rename(columns={
                time_col_name: u'time',
                self.table.options.value_column_name: pivot
            },
                          inplace=True)

            sub_dfs.append(sub_df)

        df_final = reduce(
            lambda df1, df2: pandas.merge(df1, df2, on=u'time', how='outer'),
            sub_dfs)

        return QueryComplete(df_final)
Example 16
    def run(self):
        sh = DeviceManager.get_device(self.job.criteria.steelhead_device)

        flows = Model.get(sh, feature='flows')
        res = flows.show_flows('all')

        for k, v in res['flows_summary'].iteritems():
            v['category'] = k

        return QueryComplete(res['flows_summary'].values())
Example 17
    def run(self):

        sh_db = self.job.criteria.dev

        cmd = self.job.criteria.command

        sh = DeviceManager.get_device(sh_db.id)
        output = sh.cli.exec_command(cmd, mode=CLIMode.ENABLE)

        return QueryComplete([dict(dev_name=sh_db.name, output=output)])
Example 18
    def post_run(self):
        df = self.data
        if df is not None:
            if 'eventid' in df:
                make_link = functools.partial(self.make_link, 'event-lookup')
                df['eventid'] = df['eventid'].map(make_link)
            elif 'id' in df:
                make_link = functools.partial(self.make_link, 'alert-detail')
                df['id'] = df['id'].map(make_link)
            self.data = df
        return QueryComplete(self.data)
Example 19
    def run(self):

        criteria = self.job.criteria

        if criteria.scc_device == '':
            logger.debug('%s: No scc device selected' % (self.table))
            self.job.mark_error("No SCC Device Selected")
            return False

        columns = [col.name for col in self.table.get_columns(synthetic=False)]

        scc = DeviceManager.get_device(criteria.scc_device)

        # obtain the report class definition
        report_cls = get_scc_report_class(self.service, self.resource)

        # instantiate a report object
        report_obj = report_cls(scc)

        # Build criteria kwargs
        kwargs = {}
        for name in set(report_obj.required_fields +
                        report_obj.non_required_fields):
            # criteria has attributes starttime and endtime, which map to
            # the start_time and end_time fields referenced in an SCC service
            if name in ['start_time', 'end_time']:
                name_in_criteria = name.replace('_', '')
            else:
                name_in_criteria = name

            if hasattr(criteria, name_in_criteria):
                kwargs[name] = getattr(criteria, name_in_criteria)
        report_obj.run(**kwargs)

        df = self.extract_dataframe(report_obj.data)

        if df is not None:
            for col in columns:
                if col not in df:
                    raise KeyError("Table %s has no column '%s'" %
                                   (self.job.table.name, col))

            df = df.ix[:, columns]

            self.data = df

            logger.info("SCC job %s returning %d rows of data" %
                        (self.job, len(self.data)))
        else:
            self.data = None
        return QueryComplete(self.data)
    def collect(self, jobs=None):

        out = []
        for jid, job in jobs.iteritems():
            sharkdata = job.data()
            if sharkdata is not None:
                s = Device.objects.get(id=job.criteria.netshark_device)
                out.append([
                    s.name, s.host, job.criteria.netshark_source_name[5:],
                    sharkdata['generic_bytes']
                ])

        columns = ['name', 'host', 'capjob', 'bytes']
        df = pandas.DataFrame(out, columns=columns)
        return QueryComplete(df)
Example 21
    def run(self):

        sh = DeviceManager.get_device(self.job.criteria.steelhead_device)

        stats = Model.get(sh, feature='stats')
        duration = self.job.criteria.duration
        directions = ['lan-to-wan', 'wan-to-lan', 'bi-directional']

        total = []
        for d in directions:
            res = stats.show_stats_bandwidth('all', d, duration)
            res['direction'] = d
            total.append(res)

        return QueryComplete(total)
Example 22
    def run(self):

        criteria = self.job.criteria
        profiler = DeviceManager.get_device(criteria.netprofiler_device)
        widget_config = profiler.api.templates.get_config(criteria.template_id)
        recs = []
        for w in widget_config:
            dict0 = {'template_id': str(criteria.template_id)}
            dict1 = dict((k, w[k]) for k in ['widget_id', 'title'])
            dict2 = dict(
                (k, w['config'][k])
                for k in ['widget_type', 'visualization', 'datasource'])
            recs.append(
                dict((k, v) for d in [dict0, dict1, dict2]
                     for k, v in d.iteritems()))

        return QueryComplete(pd.DataFrame(recs))
Example 23
    def run(self):

        obj_class = self.table.options.obj_class
        feature = self.table.options.feature
        method = self.table.options.method
        args = self.table.options.args

        sh = DeviceManager.get_device(self.job.criteria.steelhead_device)
        obj = obj_class.get(sh, feature=feature)
        res = getattr(obj, method)(*args)

        if not isinstance(res, list):
            res = [res]

        for e in res:
            for k, v in e.iteritems():
                e[k] = str(v)

        return QueryComplete(res)
Example 24
    def analyze(self, jobs=None):
        logger.debug('TimeSeriesTable analysis with jobs %s' % jobs)

        filtered_list = ExistingIntervals.objects.filter(
            table_handle=self.handle, criteria=self.no_time_criteria)

        existing_intervals = None

        if filtered_list:
            existing_intervals = filtered_list[0].intervals
            logger.debug('Found existing intervals for handle %s: %s' %
                         (self.handle, existing_intervals))

            if self.query_interval in existing_intervals:
                logger.debug('Query interval totally covered by DB, returning '
                             'DB query.')
                # Search DB for the queried data
                data = self.query(self.query_interval.start,
                                  self.query_interval.end)
                return QueryComplete(data)

            logger.debug('Query interval only partially covered by DB ...')

        intervals_to_call = self._check_intervals(self.query_interval -
                                                  existing_intervals)

        logger.debug('Setting up %d jobs to cover missing data '
                     'for these intervals: %s' %
                     (len(intervals_to_call), intervals_to_call))
        dep_jobs = {}
        for interval in intervals_to_call:
            criteria = copy.copy(self.job.criteria)
            # Use the two time-related fields
            criteria.starttime = interval.start
            criteria.endtime = interval.end
            job = Job.create(table=self.ds_table,
                             criteria=criteria,
                             update_progress=False,
                             parent=self.job)
            dep_jobs[job.id] = job

        return QueryContinue(self.collect, jobs=dep_jobs)
Example 25
    def run(self):

        sks = Device.objects.filter(enabled=True, module='netshark')

        res = []
        for sk in sks:
            sk_dev = DeviceManager.get_device(sk.id)
            for job in sk_dev.get_capture_jobs():

                if_name = job.data['config']['interface_name']

                start = str(nsec_string_to_datetime(job.packet_start_time))
                end = str(nsec_string_to_datetime(job.packet_end_time))

                bpf_filter = job.data['config'].get('bpf_filter', '')

                if len(bpf_filter) > self.MAX_LENGTH:
                    bpf_filter = bpf_filter[:self.MAX_LENGTH - 2] + '...'

                pkts_dropped = job.get_stats()['packets_dropped']
                pkts_written = job.get_stats()['packets_written']

                job_data = dict(netshark=sk.name,
                                job_id=job.data['id'],
                                job_name=job.data['config']['name'],
                                interface=if_name,
                                state=job.data['status']['state'],
                                size=job.data['status']['packet_size'],
                                start_time=start,
                                end_time=end,
                                bpf_filter=bpf_filter,
                                dpi_enabled=str(job.dpi_enabled),
                                index_enabled=str(job.index_enabled),
                                last_sec_dropped=pkts_dropped['last_second'],
                                last_min_dropped=pkts_dropped['last_minute'],
                                last_hr_dropped=pkts_dropped['last_hour'],
                                last_sec_written=pkts_written['last_second'],
                                last_min_written=pkts_written['last_minute'],
                                last_hr_written=pkts_written['last_hour'])
                res.append(job_data)

        return QueryComplete(pandas.DataFrame(res))
Example 26
    def post_run(self):
        """Execute any Functions saved to Table.

        In most cases, this function will be simply overridden by a
        subclass which will implement its own detailed processing.  This
        method provides a shortcut to support passing a Function
        directly to the create method.
        """
        options = self.table.options
        if options.function is None:
            return QueryError("Table %s has no analysis function defined" %
                              self.table)

        try:
            df = options.function(self, options.tables, self.job.criteria)

        except Exception as e:
            return QueryError("Analysis function %s failed" % options.function,
                              e)

        logger.debug("%s: completed successfully" % self)
        return QueryComplete(df)
Example 27
    def collect(self, jobs=None):

        out = []
        for jid, job in jobs.iteritems():
            ardata = job.data()
            if ardata is not None:
                total_bytes = ardata['total_bytes'].sum()
                if total_bytes:
                    s = Device.objects.get(id=job.criteria.appresponse_device)
                    out.append([s.name,
                                s.host,
                                job.criteria.appresponse_source,
                                total_bytes])

        if not out:
            out.append([
                'No capture jobs found', '--', '--', ''
            ])

        columns = ['name', 'host', 'capture_job', 'bytes']
        df = pandas.DataFrame(out, columns=columns)
        return QueryComplete(df)
Example 28
    def run(self):
        """ Main execution method. """
        args = self._prepare_report_args()

        with lock:
            report = MultiQueryReport(args.profiler)
            report.run(template_id=self.table.options.template_id,
                       timefilter=args.timefilter,
                       trafficexpr=args.trafficexpr,
                       resolution=args.resolution)

        data = self._wait_for_data(report)
        headers = report.get_legend()

        # create dataframe with all of the default headers
        df = pandas.DataFrame(data, columns=[h.key for h in headers])

        # now filter down to the columns requested by the table
        columns = [col.name for col in self.table.get_columns(synthetic=False)]
        df = df[columns]

        logger.info("Report %s returned %s rows" % (self.job, len(df)))
        return QueryComplete(df)
Example 29
    def collect(self, jobs=None):
        logger.debug('TimeSeriesTable collect with jobs %s' % jobs)
        dfs_from_jobs, dfs_from_db = [], []

        objs = ExistingIntervals.objects.filter(table_handle=self.handle,
                                                criteria=self.no_time_criteria)

        if objs:
            # we should only find one with our handle
            if len(objs) > 1:
                logger.warning('Multiple instances of ExistingIntervals found '
                               'for handle %s, taking first one.' %
                               self.handle)
            obj = objs[0]

            # Query for the intervals already in db
            itvs_in_db = self.query_interval.intersection(obj.intervals)
            dfs_from_db = [
                self.query(itv.start, itv.end) for itv in itvs_in_db
            ]
        else:
            logger.debug(
                'Creating new ExistingIntervals object for '
                'namespace: %s, sourcefile: %s, table: %s, handle: %s' %
                (self.ds_table.namespace, self.ds_table.sourcefile,
                 self.ds_table.name, self.handle))

            obj = ExistingIntervals(namespace=self.ds_table.namespace,
                                    sourcefile=self.ds_table.sourcefile,
                                    table=self.ds_table.name,
                                    criteria=self.no_time_criteria,
                                    table_handle=self.handle,
                                    intervals=IntervalList([]))

        job_intervals = obj.intervals

        for job_id, job in jobs.iteritems():
            df = job.data()
            if df is None:
                continue
            dfs_from_jobs.append(df)

            # evaluate job time interval extents

            # Handle the case where the returned data covers slightly less
            # than the base query. This can happen because the data doesn't
            # always align with the query edges, but any future query of the
            # same requested time interval would return the same results.

            # If our new interval is entirely within the original query, and
            # the delta at each edge is less than the resolution of the table,
            # then we got the best we could hope for, and we should just add
            # the interval as originally requested.
            job_criteria_start = job.criteria.starttime
            job_criteria_end = job.criteria.endtime

            job_data_start = df[self.time_col].min().to_datetime()
            job_data_end = df[self.time_col].max().to_datetime()

            if abs(job_data_start - job_criteria_start) < self.resolution:
                interval_start = job_criteria_start
            else:
                interval_start = job_data_start

            if abs(job_criteria_end - job_data_end) < self.resolution:
                interval_end = job_criteria_end
            else:
                interval_end = job_data_end

            logger.debug('Job time intervals: Criteria - (%s, %s), '
                         'Data - (%s, %s)' %
                         (job_criteria_start, job_criteria_end, job_data_start,
                          job_data_end))

            interval = TimeInterval(interval_start, interval_end)
            logger.debug('Appending TimeInterval: %s' % interval)

            job_intervals += interval

        if not dfs_from_jobs:
            return QueryComplete(None)

        if self.table.options.override_index:
            index = self.table.options.override_index
        else:
            index = make_index(self.ds_table.namespace)

        storage.write(index=index,
                      doctype=self.handle,
                      data_frame=pandas.concat(dfs_from_jobs,
                                               ignore_index=True),
                      timecol=self.time_col,
                      id_method=self.table.options.id_method)

        obj.intervals = self._converge_adjacent(job_intervals)
        obj.tzinfo = self.job.criteria.starttime.tzinfo

        # Only update existing intervals if writing to db succeeds
        obj.save()

        # Reading from the db immediately after writing can return incorrect
        # data, so stitch the data frames together instead of re-querying
        total_df = pandas.concat(dfs_from_db + dfs_from_jobs,
                                 ignore_index=True)
        data = total_df.sort(self.time_col).drop_duplicates()

        return QueryComplete(data)
Example 30
    def collect(self, jobs):
        df = jobs['ts'].data()
        return QueryComplete(df)