class Column(models.Model):

    table = models.ForeignKey(Table)
    name = models.CharField(max_length=30)
    label = models.CharField(max_length=30, null=True)
    position = models.IntegerField()
    options = PickledObjectField()

    iskey = models.BooleanField(default=False)
    isnumeric = models.BooleanField(default=True)
    synthetic = models.BooleanField(default=False)

    # Ephemeral columns are columns added to a table at run-time
    ephemeral = models.BooleanField(default=False)

    compute_post_resample = models.BooleanField(default=False)
    compute_expression = models.CharField(max_length=300)
    resample_operation = models.CharField(max_length=300, default='sum')

    # datatype should be an enumeration:
    # metric, bytes, time    XXXCJ make enumeration
    datatype = models.CharField(max_length=50, default='')
    units = models.CharField(max_length=50, default='')

    def __unicode__(self):
        return self.label

    def save(self, *args, **kwargs):
        if self.label is None:
            self.label = self.name
        super(Column, self).save()

    @classmethod
    def create(cls, table, name, label=None, datatype='', units='',
               iskey=False, issortcol=False, options=None, **kwargs):

        if len(Column.objects.filter(table=table, name=name)) > 0:
            raise ValueError("Column %s already in use for table %s" %
                             (name, str(table)))

        c = Column(table=table, name=name, label=label, datatype=datatype,
                   units=units, iskey=iskey, options=options, **kwargs)

        posmax = Column.objects.filter(table=table).aggregate(Max('position'))
        c.position = (posmax['position__max'] or 0) + 1
        c.save()

        if issortcol:
            table.sortcol = c
            table.save()

        return c
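
# Illustrative sketch (not part of the original module): defining columns for a
# table with Column.create().  The table and the column names/units below are
# hypothetical.
def _example_define_columns(table):
    """Sketch only: add a key 'time' column and a sortable numeric column."""
    Column.create(table, name='time', label='Time',
                  datatype='time', iskey=True)
    Column.create(table, name='avg_bytes', label='Avg Bytes/s',
                  datatype='bytes', units='B/s', issortcol=True)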
class Widget(models.Model):
    """ Defines a UI widget and the source data tables. """

    tables = models.ManyToManyField(Table)
    section = models.ForeignKey(Section)
    title = models.CharField(max_length=100)
    row = models.IntegerField()
    col = models.IntegerField()
    width = models.IntegerField(default=1)
    height = models.IntegerField(default=300)
    rows = models.IntegerField(default=-1)
    options = PickledObjectField()

    module = models.CharField(max_length=100)
    uiwidget = models.CharField(max_length=100)
    uioptions = PickledObjectField()

    objects = InheritanceManager()

    def __repr__(self):
        return '<Widget %s (%s)>' % (self.title, self.id)

    def __unicode__(self):
        return '<Widget %s (%s)>' % (self.title, self.id)

    def widgettype(self):
        return 'rvbd_%s.%s' % (self.module.split('.')[-1], self.uiwidget)

    def table(self, i=0):
        return self.tables.all()[i]

    def compute_row_col(self):
        rowmax = self.section.report.widgets().aggregate(Max('row'))
        row = rowmax['row__max']
        if row is None:
            row = 1
            col = 1
        else:
            widthsum = (self.section.report.widgets()
                        .filter(row=row).aggregate(Sum('width')))
            width = widthsum['width__sum']
            if width + self.width > 12:
                row = row + 1
                col = 1
            else:
                col = width + 1
        self.row = row
        self.col = col

    def criteria_from_form(self, form):
        """ Extract POST-style criteria data from form. """
        fields_by_section = self.section.report.collect_fields_by_section()
        common_fields = fields_by_section[0]
        section_fields = fields_by_section[self.section.id]

        # Reverse the process of adding the prefix to SECTION-level criteria.
        # If a field is in section_fields, its id has the prefix -- just use
        # the original keyword in the returned fields.
        fields = {}
        for k, v in form.as_text().iteritems():
            if k in common_fields:
                fields[common_fields[k].keyword] = v
            elif k in section_fields:
                fields[section_fields[k].keyword] = v

        return fields

    def collect_fields(self):
        # Gather up all fields
        fields = SortedDict()

        # All fields attached to the section's report
        for f in self.section.report.fields.all().order_by('id'):
            fields[f.keyword] = f

        # All fields attached to the section
        for f in self.section.fields.all().order_by('id'):
            fields[f.keyword] = f

        # All fields attached to any of this widget's tables
        for t in self.tables.all():
            for f in t.fields.all().order_by('id'):
                fields[f.keyword] = f

        return fields
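
# Illustrative sketch (not part of the original module): Widget.compute_row_col()
# packs widgets into a 12-unit-wide grid, wrapping to a new row when the summed
# widths of the current row plus this widget would exceed 12.  The section and
# title below are hypothetical; the section is assumed to be saved and attached
# to a report.
def _example_place_widget(section):
    """Sketch only: position a half-width (6 of 12 units) widget."""
    w = Widget(section=section, title='Example widget', width=6,
               module='example.module', uiwidget='TimeSeriesWidget')
    w.compute_row_col()   # fills in w.row / w.col from the existing widgets
    return w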
class TableField(models.Model):
    """
    Defines a single field associated with a table.

    TableFields define the parameters that are used by a Table at run
    time.  The Table.fields attribute associates one or more fields with
    the table.

    At run time, a Criteria object binds values to each field.  The
    Criteria object has an attribute matching each associated TableField
    keyword.

    When defining a TableField, the following model attributes may be
    specified:

    :param keyword: short identifier used like a variable name; this must
        be unique per table

    :param label: text label displayed in user interfaces

    :param help_text: descriptive help text associated with this field

    :param initial: starting or default value to use in user interfaces

    :param required: boolean indicating if a non-null value must be
        provided

    :param hidden: boolean indicating if this field should be hidden in
        user interfaces, usually true when the value is computed from
        other fields via post_process_func or post_process_template

    :param field_cls: Django Form Field class to use for rendering.
        If not specified, this defaults to CharField

    :param field_kwargs: dictionary of additional field-specific kwargs to
        pass to the field_cls constructor

    :param parent_keywords: list of parent keywords that this field
        depends on for a final value.  Used in conjunction with either
        post_process_func or post_process_template.

    :param pre_process_func: function to call to perform any necessary
        preprocessing before rendering a form field or accepting user
        input

    :param post_process_func: function to call to perform any post-submit
        processing.  This may be additional value cleanup or computation
        based on other form data.

    :param post_process_template: simple string-format style template to
        fill in based on other form criteria

    """
    keyword = models.CharField(max_length=100)
    label = models.CharField(max_length=100, null=True, default=None)
    help_text = models.CharField(blank=True, null=True, default=None,
                                 max_length=400)
    initial = PickledObjectField(blank=True, null=True)
    required = models.BooleanField(default=False)
    hidden = models.BooleanField(default=False)
    field_cls = PickledObjectField(null=True)
    field_kwargs = PickledObjectField(blank=True, null=True)

    parent_keywords = SeparatedValuesField(null=True)

    pre_process_func = FunctionField(null=True)
    dynamic = models.BooleanField(default=False)
    post_process_func = FunctionField(null=True)
    post_process_template = models.CharField(null=True, max_length=500)

    @classmethod
    def create(cls, keyword, label=None, obj=None, **kwargs):
        parent_keywords = kwargs.pop('parent_keywords', None)
        if parent_keywords is None:
            parent_keywords = []

        field = cls(keyword=keyword, label=label, **kwargs)
        field.save()

        if field.post_process_template is not None:
            f = string.Formatter()
            for (_, parent_keyword, _, _) in \
                    f.parse(field.post_process_template):
                if parent_keyword is not None:
                    parent_keywords.append(parent_keyword)

        field.parent_keywords = parent_keywords
        field.save()

        if obj is not None:
            obj.fields.add(field)

        return field

    def __unicode__(self):
        return "<TableField %s (%s)>" % (self.keyword, self.id)

    def __repr__(self):
        return unicode(self)

    def is_report_criteria(self, table):
        """ Runs through intersections of widgets to determine if this
        criteria is applicable to the passed table

           report  <-->  widgets  <-->  table
              |
              L-- TableField (self)

        """
        wset = set(table.widget_set.all())
        rset = set(self.report_set.all())
        return any(wset.intersection(set(rwset.widget_set.all()))
                   for rwset in rset)

    @classmethod
    def find_instance(cls, key):
        """ Return instance given a keyword. """
        params = TableField.objects.filter(keyword=key)
        if len(params) == 0:
            return None
        elif len(params) > 1:
            raise KeyError("Multiple TableField matches found for %s" % key)
        param = params[0]
        return param
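
# Illustrative sketch (not part of the original module): TableField.create()
# parses any "{keyword}" references in post_process_template and records them
# as parent_keywords.  The keywords below are hypothetical.
def _example_define_fields(table):
    """Sketch only: a user-visible field plus a hidden field derived from it."""
    TableField.create('duration', label='Duration', obj=table, required=True)
    TableField.create('duration_seconds', obj=table, hidden=True,
                      post_process_template='{duration}')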
class Job(models.Model):

    # Timestamp when the job was created
    created = models.DateTimeField(auto_now_add=True)

    # Timestamp the last time the job was accessed
    touched = models.DateTimeField(auto_now_add=True)

    # Number of references to this job
    refcount = models.IntegerField(default=0)

    # Whether this job is a child of another job
    ischild = models.BooleanField(default=False)

    # If ischild, this points to the parent job
    parent = models.ForeignKey('self', null=True)

    # Table associated with this job
    table = models.ForeignKey(Table)

    # Criteria used to start this job - an instance of the Criteria class
    criteria = PickledObjectField(null=True)

    # Actual criteria as returned by the job after running
    actual_criteria = PickledObjectField(null=True)

    # Unique handle for the job
    handle = models.CharField(max_length=100, default="")

    # Job status
    NEW = 0
    RUNNING = 1
    COMPLETE = 3
    ERROR = 4
    status = models.IntegerField(
        default=NEW,
        choices=((NEW, "New"),
                 (RUNNING, "Running"),
                 (COMPLETE, "Complete"),
                 (ERROR, "Error")))

    # Message if job complete or error
    message = models.TextField(default="")

    # While RUNNING, this provides an indicator of progress 0-100
    progress = models.IntegerField(default=-1)

    # While RUNNING, time remaining
    remaining = models.IntegerField(default=None, null=True)

    def __unicode__(self):
        return "<Job %s (%8.8s) - t%s>" % (self.id, self.handle,
                                           self.table.id)

    def __repr__(self):
        return unicode(self)

    def refresh(self):
        """ Refresh dynamic job parameters from the database. """
        job = Job.objects.get(pk=self.pk)
        for k in ['status', 'message', 'progress', 'remaining',
                  'actual_criteria', 'touched', 'refcount']:
            setattr(self, k, getattr(job, k))

    @transaction.commit_on_success
    def safe_update(self, **kwargs):
        """ Update the job with the passed dictionary in a database-safe way.

        This method updates only the requested parameters and refreshes
        the rest from the database.  This should be used for all updates
        to Jobs to ensure that unmodified keys are not accidentally
        clobbered by doing a blanket job.save().

        """
        if kwargs is None:
            return

        with LocalLock():
            logger.debug("%s safe_update %s" % (self, kwargs))
            Job.objects.filter(pk=self.pk).update(**kwargs)

            # Force a reload of the job to get latest data
            self.refresh()

            if not self.ischild:
                # Push changes to children of this job
                child_kwargs = {}
                for k, v in kwargs.iteritems():
                    if k in ['status', 'message', 'progress', 'remaining',
                             'actual_criteria']:
                        child_kwargs[k] = v
                # There should be no recursion, so a direct update to the
                # database is possible.  (If recursion, would need to call
                # safe_update() on each child.)
                Job.objects.filter(parent=self).update(**child_kwargs)

    @classmethod
    def create(cls, table, criteria):
        with LocalLock():
            with transaction.commit_on_success():
                # Grab a lock on the row associated with the table
                table = Table.objects.select_for_update().get(id=table.id)

                # Lock down start/end times
                try:
                    criteria.compute_times()
                except ValueError:
                    # Ignore errors, this table may not have start/end times
                    pass

                # Compute the handle -- this will take into account
                # cacheability
                handle = Job._compute_handle(table, criteria)

                # Look for another job by the same handle in any state
                # except ERROR
                if not criteria.ignore_cache:
                    parents = (Job.objects.select_for_update()
                               .filter(status__in=[Job.NEW, Job.COMPLETE,
                                                   Job.RUNNING],
                                       handle=handle,
                                       ischild=False)
                               .order_by('created'))
                else:
                    parents = None

                if parents is not None and len(parents) > 0:
                    parent = parents[0]

                    job = Job(table=table,
                              criteria=criteria,
                              actual_criteria=parent.actual_criteria,
                              status=parent.status,
                              handle=handle,
                              parent=parent,
                              ischild=True,
                              progress=parent.progress,
                              remaining=parent.remaining,
                              message='')
                    job.save()

                    parent.reference("Link from job %s" % job)
                    now = datetime.datetime.now(tz=pytz.utc)
                    parent.safe_update(touched=now)

                    logger.info("%s: New job for table %s, linked to "
                                "parent %s" % (job, table.name, parent))
                else:
                    job = Job(table=table,
                              criteria=criteria,
                              status=Job.NEW,
                              handle=handle,
                              parent=None,
                              ischild=False,
                              progress=0,
                              remaining=-1,
                              message='')
                    job.save()
                    logger.info("%s: New job for table %s" %
                                (job, table.name))

                logger.debug("%s: criteria = %s" % (job, criteria))

        # Flush old jobs
        Job.age_jobs()

        return job

    @classmethod
    def _compute_handle(cls, table, criteria):
        h = hashlib.md5()
        h.update(str(table.id))

        if table.cacheable and not criteria.ignore_cache:
            # XXXCJ - Drop ephemeral columns when computing the cache handle,
            # since the list of columns is modified at run time.  A typical
            # use case is an analysis table which creates a time-series graph
            # of the top 10 hosts -- one column per host.  The host columns
            # will change based on the run of the dependent table.
            #
            # Including ephemeral columns causes some problems because the
            # handle is computed before the query is actually run, so it
            # never matches.
            #
            # May want to dig into this further and make sure this doesn't
            # pick up cache files when we don't want it to.
            h.update('.'.join([c.name for c in table.get_columns()]))

            if table.criteria_handle_func:
                criteria = table.criteria_handle_func.function(criteria)

            for k, v in criteria.iteritems():
                # logger.debug("Updating hash from %s -> %s" % (k, v))
                h.update('%s:%s' % (k, v))
        else:
            # Table is not cacheable; instead use the current time plus a
            # random value just to get a unique hash
            h.update(str(datetime.datetime.now()))
            h.update(str(random.randint(0, 10000000)))

        return h.hexdigest()

    def reference(self, message=""):
        pk = self.pk
        Job.objects.filter(pk=pk).update(refcount=F('refcount') + 1)
        # logger.debug("%s: reference(%s) @ %d" %
        #              (self, message, Job.objects.get(pk=pk).refcount))

    def dereference(self, message=""):
        pk = self.pk
        Job.objects.filter(pk=pk).update(refcount=F('refcount') - 1)
        # logger.debug("%s: dereference(%s) @ %d" %
        #              (self, message, Job.objects.get(pk=pk).refcount))

    def get_columns(self, ephemeral=None, **kwargs):
        """ Return the columns associated with the table for the job.

        The returned column set includes ephemeral columns associated
        with this job unless ephemeral is set to False.

        """
        if ephemeral is None:
            kwargs['ephemeral'] = self.parent or self
        return self.table.get_columns(**kwargs)

    def json(self, data=None):
        """ Return a JSON representation of this Job. """
        return {'id': self.id,
                'handle': self.handle,
                'progress': self.progress,
                'remaining': self.remaining,
                'status': self.status,
                'message': self.message,
                'data': data}

    def combine_filterexprs(self, joinstr="and", exprs=None):
        self.refresh()

        criteria = self.criteria
        if exprs is None:
            exprs = []
        elif type(exprs) is not list:
            exprs = [exprs]

        exprs.append(self.table.filterexpr)

        nonnull_exprs = []
        for e in exprs:
            if e != "" and e is not None:
                nonnull_exprs.append(e)

        if len(nonnull_exprs) > 1:
            return "(" + (") " + joinstr + " (").join(nonnull_exprs) + ")"
        elif len(nonnull_exprs) == 1:
            return nonnull_exprs[0]
        else:
            return ""

    def start(self):
        """ Start this job. """
        self.refresh()

        if self.ischild:
            logger.debug("%s: Shadowing parent job %s" % (self, self.parent))
            return

        with transaction.commit_on_success():
            logger.debug("%s: Starting job" % str(self))
            self.mark_progress(0)

            logger.debug("%s: Worker to run report" % str(self))

            # Lookup the query class for this table
            i = importlib.import_module(self.table.module)
            queryclass = i.TableQuery

            # Create a worker to do the work
            worker = Worker(self, queryclass)
            worker.start()

    def mark_error(self, message):
        logger.warning("%s failed: %s" % (self, message))
        self.safe_update(status=Job.ERROR,
                         progress=100,
                         message=message)

    def mark_complete(self):
        logger.info("%s complete" % self)
        self.safe_update(status=Job.COMPLETE,
                         progress=100,
                         message='')

    def mark_progress(self, progress, remaining=None):
        logger.debug("%s progress %s" % (self, progress))
        self.safe_update(status=Job.RUNNING,
                         progress=progress,
                         remaining=remaining)

    def datafile(self):
        """ Return the data file for this job. """
        return os.path.join(settings.DATA_CACHE, "job-%s.data" % self.handle)

    def data(self):
        """ Returns a pandas.DataFrame of data, or None if not available. """
        with transaction.commit_on_success():
            self.refresh()

            if not self.status == Job.COMPLETE:
                raise ValueError("Job not complete, no data available")

            self.reference("data()")

            e = None
            try:
                logger.debug("%s looking for data file: %s" %
                             (str(self), self.datafile()))
                if os.path.exists(self.datafile()):
                    df = pandas.load(self.datafile())
                    logger.debug("%s data loaded %d rows from file: %s" %
                                 (str(self), len(df), self.datafile()))
                else:
                    logger.debug("%s no data, missing data file: %s" %
                                 (str(self), self.datafile()))
                    df = None
            except Exception as e:
                logger.error("Error loading datafile %s for %s" %
                             (self.datafile(), str(self)))
                logger.error("Traceback:\n%s" % e)
            finally:
                self.dereference("data()")

        if e:
            raise e

        return df

    def values(self):
        """ Return data as a list of lists. """

        df = self.data()
        if df is not None:
            # Replace NaN with None
            df = df.where(pandas.notnull(df), None)

            # Extract the values in the right order
            all_columns = self.get_columns()
            all_col_names = [c.name for c in all_columns]

            # Straggling numpy data types may cause problems downstream
            # (json encoding, for example), so strip things down to just
            # native ints and floats
            vals = []
            for row in df.ix[:, all_col_names].itertuples():
                vals_row = []
                for v in row[1:]:
                    if (isinstance(v, numpy.number) or
                            isinstance(v, numpy.bool_)):
                        v = numpy.asscalar(v)
                    vals_row.append(v)
                vals.append(vals_row)
        else:
            vals = []
        return vals

    @classmethod
    def age_jobs(cls, old=None, ancient=None, force=False):
        """ Delete old jobs that have no refcount and all ancient jobs. """
        # Throttle - only run this at most once every 15 minutes
        global age_jobs_last_run
        if not force and time.time() - age_jobs_last_run < 60 * 15:
            return

        age_jobs_last_run = time.time()

        if old is None:
            old = datetime.timedelta(
                seconds=settings.APPS_DATASOURCE['job_age_old_seconds'])
        elif type(old) in [int, float]:
            old = datetime.timedelta(seconds=old)

        if ancient is None:
            ancient = datetime.timedelta(
                seconds=settings.APPS_DATASOURCE['job_age_ancient_seconds'])
        elif type(ancient) in [int, float]:
            ancient = datetime.timedelta(seconds=ancient)

        with transaction.commit_on_success():
            # Ancient jobs are deleted regardless of refcount
            now = datetime.datetime.now(tz=pytz.utc)
            try:
                qs = (Job.objects.select_for_update()
                      .filter(touched__lte=now - ancient))
                if len(qs) > 0:
                    logger.info('Deleting %d ancient jobs ...' % len(qs))
                    qs.delete()
            except:
                logger.exception("Failed to delete ancient jobs")

            # Old jobs are deleted only if they have a refcount of 0
            try:
                qs = (Job.objects.select_for_update()
                      .filter(touched__lte=now - old, refcount=0))
                if len(qs) > 0:
                    logger.info('Deleting %d old jobs ...' % len(qs))
                    qs.delete()
            except:
                logger.exception("Failed to delete old jobs")

    @classmethod
    def flush_incomplete(cls):
        jobs = Job.objects.filter(progress__lt=100)
        logger.info("Flushing %d incomplete jobs: %s" %
                    (len(jobs), [j.id for j in jobs]))
        jobs.delete()

    def done(self):
        self.refresh()
        logger.debug("%s status: %s - %s%%" %
                     (str(self), self.status, self.progress))
        return self.status == Job.COMPLETE or self.status == Job.ERROR
class Table(models.Model):
    name = models.CharField(max_length=200)
    module = models.CharField(max_length=200)    # source module name
    sortcol = models.ForeignKey('Column', null=True, related_name='Column')
    rows = models.IntegerField(default=-1)
    filterexpr = models.CharField(null=True, max_length=400)

    # resample flag -- resample to the criteria.resolution
    # - this requires a "time" column
    resample = models.BooleanField(default=False)

    # options are typically fixed attributes defined at Table creation
    options = PickledObjectField()

    # list of fields that must be bound to values in criteria
    # for this table to run
    fields = models.ManyToManyField(TableField, null=True)

    # Default values for fields associated with this table; these
    # may be overridden by user criteria at run time
    criteria = PickledObjectField()

    # Function to call to tweak criteria for computing a job handle.
    # This must return a dictionary of key/value pairs to use for
    # determining when a job must be rerun.
    criteria_handle_func = FunctionField(null=True)

    # Indicates if data can be cached
    cacheable = models.BooleanField(default=True)

    @classmethod
    def create(cls, name, module, **kwargs):
        t = Table(name=name, module=module, **kwargs)
        t.save()
        return t

    def __unicode__(self):
        return "<Table %s (%s)>" % (str(self.id), self.name)

    def __repr__(self):
        return unicode(self)

    def get_columns(self, synthetic=None, ephemeral=None, iskey=None):
        """ Return the list of columns for this table.

        `synthetic` is tri-state: None (default) is don't care,
            True means only synthetic columns, False means only
            non-synthetic columns

        `ephemeral` is a job reference.  If specified, include
            ephemeral columns related to this job

        `iskey` is tri-state: None (default) is don't care,
            True means only key columns, False means only non-key
            columns

        """
        filtered = []
        for c in Column.objects.filter(table=self).order_by('position'):
            if synthetic is not None and c.synthetic != synthetic:
                continue
            if c.ephemeral is not None and c.ephemeral != ephemeral:
                continue
            if iskey is not None and c.iskey != iskey:
                continue

            filtered.append(c)

        return filtered

    def copy_columns(self, table, columns=None, except_columns=None):
        """ Copy the columns from `table` into this table.

        This method will copy all the columns from another table,
        including all attributes as well as sorting.

        """
        posmax = Column.objects.filter(table=table).aggregate(Max('position'))
        pos = (posmax['position__max'] or 0) + 1

        for c in table.get_columns():
            if columns is not None and c.name not in columns:
                continue
            if except_columns is not None and c.name in except_columns:
                continue

            issortcol = (c == c.table.sortcol)

            c.pk = None
            c.table = self
            c.position = pos
            pos = pos + 1
            c.save()

            if issortcol:
                self.sortcol = c
                self.save()

    def compute_synthetic(self, job, df):
        """ Compute the synthetic columns from df, a two-dimensional array
        of the non-synthetic columns.

        Synthesis occurs as follows:

        1. Compute all synthetic columns where compute_post_resample
           is False

        2. If the table is a time-based table with a defined resolution,
           the result is resampled.

        3. Any remaining columns are computed.

        """
        if df is None:
            return None

        all_columns = job.get_columns()
        all_col_names = [c.name for c in all_columns]

        def compute(df, syncols):
            # logger.debug("Compute: syncol = %s" %
            #              ([c.name for c in syncols]))
            for syncol in syncols:
                expr = syncol.compute_expression
                g = tokenize.generate_tokens(StringIO(expr).readline)
                newexpr = ""
                getvalue = False
                getclose = False
                for ttype, tvalue, _, _, _ in g:
                    if getvalue:
                        if ttype != tokenize.NAME:
                            msg = ("Invalid syntax, expected {name}: %s" %
                                   tvalue)
                            raise ValueError(msg)
                        elif tvalue not in all_col_names:
                            raise ValueError("Invalid column name: %s" %
                                             tvalue)
                        newexpr += "df['%s']" % tvalue
                        getclose = True
                        getvalue = False
                    elif getclose:
                        if ttype != tokenize.OP and tvalue != "}":
                            msg = ("Invalid syntax, expected {name}: %s" %
                                   tvalue)
                            raise ValueError(msg)
                        getclose = False
                    elif ttype == tokenize.OP and tvalue == "{":
                        getvalue = True
                    else:
                        newexpr += tvalue

                df[syncol.name] = eval(newexpr)

        # 1. Compute synthetic columns where post_resample is False
        compute(df, [col for col in all_columns
                     if (col.synthetic and
                         col.compute_post_resample is False)])

        # 2. Resample
        colmap = {}
        timecol = None
        for col in all_columns:
            colmap[col.name] = col
            if col.datatype == "time":
                timecol = col.name

        if self.resample:
            if timecol is None:
                raise (TableComputeSyntheticError(
                    "Table %s 'resample' is set but no 'time' column" %
                    self))

            if (('resolution' not in job.criteria) and
                    ('resample_resolution' not in job.criteria)):
                raise (TableComputeSyntheticError(
                    ("Table %s 'resample' is set but criteria missing " +
                     "'resolution' or 'resample_resolution'") % self))

            how = {}
            for k in df.keys():
                if k == timecol:
                    continue
                how[k] = colmap[k].resample_operation

            indexed = df.set_index(timecol)

            if 'resample_resolution' in job.criteria:
                resolution = job.criteria.resample_resolution
            else:
                resolution = job.criteria.resolution

            resolution = timedelta_total_seconds(resolution)
            if resolution < 1:
                raise (TableComputeSyntheticError(
                    ("Table %s cannot resample at a resolution " +
                     "less than 1 second") % self))

            logger.debug('%s: resampling to %ss' % (self, int(resolution)))
            indexed.save('/tmp/indexed.pd')
            resampled = indexed.resample('%ss' % int(resolution), how,
                                         convention='end').reset_index()
            df = resampled

        # 3. Compute remaining synthetic columns (post_resample is True)
        compute(df, [c for c in all_columns
                     if (c.synthetic and
                         c.compute_post_resample is True)])

        return df