class Digest:
    def __init__(self):
        self.digest = TDigest()
        self.digest.update(0)
        self._count = 0
        self.lock = asyncio.Lock()

    def add(self, v):
        self.digest.update(v)
        self._count += 1

    def percentile(self, v):
        return self.digest.percentile(v)

    def count(self):
        return self._count
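# A minimal usage sketch for the Digest wrapper above. Note that add() does not
# take the lock itself, so guarding concurrent updates with digest.lock is an
# assumption about intended use, not something shown in the snippet:
import asyncio

async def record_latencies(digest, samples):
    for s in samples:
        async with digest.lock:  # illustrative: serialize concurrent updates
            digest.add(s)
    return digest.percentile(99), digest.count()

# asyncio.run(record_latencies(Digest(), [0.12, 0.38, 0.05, 1.7]))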
def detect_anomalies(self, data, anomaly_fraction):
    data = np.asanyarray(data)
    if len(data.shape) == 1:
        data = data[:, np.newaxis]
    signal = self.reconstruct_signal(data)
    digest = TDigest()
    n = data.shape[0]
    delta = np.zeros(data.shape)
    for i in xrange(n):
        error = self.compute_error(data[i, :], signal[i, :])
        delta[i, :] = error
        digest.update(np.abs(error))
    threshold = digest.quantile(1 - anomaly_fraction)
    anomalies = []
    for i in xrange(n):
        element = delta[i]
        if np.abs(element) > threshold:
            anomalies.append(Anomaly(data[i], element, i))
    return anomalies
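# The thresholding idea above in isolation: feed absolute errors into a digest
# and flag everything beyond the (1 - anomaly_fraction) quantile. A minimal
# sketch with synthetic numbers, using percentile() (0-100 scale) rather than
# the quantile() call used in the method:
import numpy as np
from tdigest import TDigest

errors = np.abs(np.random.normal(size=1000))
digest = TDigest()
digest.batch_update(errors)
threshold = digest.percentile((1 - 0.01) * 100)  # top 1% of errors
anomaly_idx = np.where(errors > threshold)[0]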
class Percentiles(DataFrameModule):
    parameters = [('percentiles', object, [0.25, 0.5, 0.75]),
                  ('history', np.dtype(int), 3)]

    def __init__(self, column, percentiles=None, **kwds):
        if not column:
            raise ProgressiveError('Need a column name')
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame)])
        super(Percentiles, self).__init__(dataframe_slot='percentiles', **kwds)
        self._column = column
        self.default_step_size = 1000
        self.tdigest = TDigest()
        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
        if (percentiles != 0.5).all():  # median isn't included
            lh = percentiles[percentiles < .5]
            uh = percentiles[percentiles > .5]
            percentiles = np.hstack([lh, 0.5, uh])
        self._percentiles = percentiles
        self.schema = [(_pretty_name(x), np.dtype(float), np.nan)
                       for x in self._percentiles]
        self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC)
        self._df = create_dataframe(self.schema)

    def is_ready(self):
        if self.get_input_slot('df').has_created():
            return True
        return super(Percentiles, self).is_ready()

    def run_step(self, run_number, step_size, howlong):
        dfslot = self.get_input_slot('df')
        dfslot.update(run_number)
        if dfslot.has_updated() or dfslot.has_deleted():
            dfslot.reset()
            dfslot.update(run_number)
            self.tdigest = TDigest()  # reset
        indices = dfslot.next_created(step_size)
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=steps)
        input_df = dfslot.data()
        with dfslot.lock:
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x)
        df = self._df
        values = []
        for p in self._percentiles:
            values.append(self.tdigest.percentile(p * 100))
        values.append(run_number)
        with self.lock:
            df.loc[run_number] = values
            if len(df) > self.params.history:
                self._df = df.loc[df.index[-self.params.history:]]
        return self._return_run_step(dfslot.next_state(),
                                     steps_run=steps, reads=steps,
                                     updates=len(self._df))
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : TODO
        TODO
    delta : float
        TODO

    Returns
    -------
    TODO
    """
    col = series.name
    if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug("column_summary - " + col)

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult["n"] = column_props[col]["notnulls"]

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult["percentiles"] = {
        perc: np.nanpercentile(series, perc) for perc in percentiles
    }
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - {} - creating TDigest...".format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - {} - testing log trans...".format(col))
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning("test_logtrans has failed for column `{}`: {}".format(
            col, e))
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = logdigest.percentile(
            75) - logdigest.percentile(25)

    logger.debug("column_summary - {} - should {}be log-transformed".format(
        col, "NOT " if not colresult["logtrans"] else ""))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug("column_summary - {} - computing histogram...".format(col))
    if column_props[col]["is_categorical"]:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult["logtrans"]:
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins="fd")
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins="fd")

    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE
    logger.debug("column_summary - {} - computing KDE...".format(col))
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)
    logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw))
    if column_props[col]["is_categorical"]:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])

    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
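# The centroid list stored in colresult["tdigest"] can be rehydrated into a
# working digest later, e.g. to answer percentile queries without the raw data.
# A minimal sketch, not part of column_summary; the helper name is illustrative
# and assumes the CamDavidsonPilon tdigest package:
from tdigest import TDigest

def rehydrate_tdigest(centroids, delta=0.01):
    """Rebuild an approximate digest from (mean, count) pairs."""
    digest = TDigest(delta)
    for mean, count in centroids:
        digest.update(mean, count)  # weighted update, one call per centroid
    return digest

# usage: rehydrate_tdigest(summary[col]["tdigest"]).percentile(50)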
def ingestRecordsMultiProc(dimensionsMetrics, dimensionsEvents, args):
    ## Register sigint handler
    signal.signal(signal.SIGINT, signalHandlerMultiProc)

    numHosts = len(dimensionsMetrics)
    remainder = numHosts % args.processes
    startId = 0

    ingestionStart = timer()
    for processId in range(1, args.processes + 1):
        endId = startId + int(
            numHosts / args.processes) + (1 if remainder > 0 else 0)
        if endId > numHosts:
            print(
                "Number of processes more than number of hosts, skipping process creation"
            )
            break
        print("Starting process {} with host ranges: [{}, {}]".format(
            processId, startId, endId - 1))

        ## Select a subset of hosts
        dimensionsMetricsLocal = dimensionsMetrics[startId:endId]
        dimensionsMetricsSet = set()
        for dim in dimensionsMetricsLocal:
            dimensionsMetricsSet.add(
                (dim.region, dim.cell, dim.silo, dim.availability_zone,
                 dim.microservice_name, dim.instance_name))

        dimensionsEventsLocal = list()
        ## Select the dimension events for the hosts selected above.
        for dim in dimensionsEvents:
            host = (dim.region, dim.cell, dim.silo, dim.availability_zone,
                    dim.microservice_name, dim.instance_name)
            if host in dimensionsMetricsSet:
                dimensionsEventsLocal.append(dim)

        print(
            "Starting process {} with host ranges: [{}, {}]. Metrics: {}. Events: {}"
            .format(processId, startId, endId - 1,
                    len(dimensionsMetricsLocal), len(dimensionsEventsLocal)))

        lowUtilizationHosts, highUtilizationHosts = initializeHighAndLowUtilizationHosts(
            len(dimensionsMetricsLocal))

        parentConn, childConn = multiprocessing.Pipe()
        manager = multiprocessing.Manager()
        event = manager.Event()
        process = MultiProcessIngestWorker(
            processId, args, dimensionsMetricsLocal, dimensionsEventsLocal,
            highUtilizationHosts, lowUtilizationHosts, childConn, event)
        process.start()
        processes.append((process, parentConn, event))

        remainder -= 1
        startId = endId

    success = 0
    count = 0
    totalLatency = 0.0
    aggregatedDigests = TDigest()
    pooledVariance = 0.0
    for p, conn, event in processes:
        output = conn.recv()
        p.join()
        if output == None:
            continue
        success += output.success
        ## Pool the variance.
        if count == 0:
            pooledVariance = output.variance
        else:
            pooledVariance = ((count - 1) * pooledVariance +
                              (output.count - 1) * output.variance) / (
                                  (count - 1) + (output.count - 1))
        count += output.count
        aggregatedDigests += output.digest
        totalLatency += output.sum

    print(
        "[OVERALL] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
        .format(count, success, round(totalLatency / count, 3),
                round(math.sqrt(pooledVariance), 3),
                round(aggregatedDigests.percentile(50), 3),
                round(aggregatedDigests.percentile(90), 3),
                round(aggregatedDigests.percentile(99), 3)))

    ingestionEnd = timer()
    print("Total time to ingest: {:,} seconds".format(
        round(ingestionEnd - ingestionStart, 2)))
def __init__(self):
    super().__init__()
    self.digest = TDigest()
class BSketch:
    """BSketch: binning sketch for numerical values and binary target.

    Parameters
    ----------
    sketch : str, optional (default="gk")
        Sketch algorithm. Supported algorithms are "gk" (Greenwald-Khanna's)
        and "t-digest" (Ted Dunning) algorithm. Algorithm "t-digest" relies on
        `tdigest <https://github.com/CamDavidsonPilon/tdigest>`_.

    eps : float (default=0.01)
        Relative error epsilon.

    K : int (default=25)
        Parameter excess growth K to compute compress threshold in t-digest.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.
    """
    def __init__(self, sketch="gk", eps=0.01, K=25, special_codes=None):
        self.sketch = sketch
        self.eps = eps
        self.K = K
        self.special_codes = special_codes

        _check_parameters(sketch, eps, K, special_codes)

        self._count_missing_e = 0
        self._count_missing_ne = 0
        self._count_special_e = 0
        self._count_special_ne = 0

        if sketch == "gk":
            self._sketch_e = GK(eps)
            self._sketch_ne = GK(eps)
        elif sketch == "t-digest":
            self._sketch_e = TDigest(eps, K)
            self._sketch_ne = TDigest(eps, K)

    def add(self, x, y, check_input=False):
        """Add arrays to the sketch.

        Parameters
        ----------
        x : array-like, shape = (n_samples,)
            Training vector, where n_samples is the number of samples.

        y : array-like, shape = (n_samples,)
            Target vector relative to x.

        check_input : bool (default=False)
            Whether to check input arrays.
        """
        xc, yc, xm, ym, xs, ys, _, _, _, _, _, _, _ = split_data(
            dtype=None, x=x, y=y, special_codes=self.special_codes,
            check_input=check_input)

        # Add values to sketch
        mask = yc == 1
        if self.sketch == "gk":
            for v1 in xc[mask]:
                self._sketch_e.add(v1)
            for v0 in xc[~mask]:
                self._sketch_ne.add(v0)

        if self.sketch == "t-digest":
            self._sketch_e.batch_update(xc[mask])
            self._sketch_ne.batch_update(xc[~mask])

        # Keep track of missing and special counts
        n_missing = len(ym)
        if n_missing:
            self._count_missing_e += np.count_nonzero(ym == 1)
            self._count_missing_ne += np.count_nonzero(ym == 0)

        n_special = len(ys)
        if n_special:
            self._count_special_e += np.count_nonzero(ys == 1)
            self._count_special_ne += np.count_nonzero(ys == 0)

    def bins(self, splits):
        """Event and non-events counts for each bin given a list of split
        points.

        Parameters
        ----------
        splits : array-like, shape = (n_splits,)
            List of split points.

        Returns
        -------
        bins : tuple of arrays of size n_splits + 1.
        """
        n_bins = len(splits) + 1
        bins_e = np.zeros(n_bins).astype(np.int64)
        bins_ne = np.zeros(n_bins).astype(np.int64)

        indices_e, count_e = self._indices_count(self._sketch_e, splits)
        indices_ne, count_ne = self._indices_count(self._sketch_ne, splits)

        for i in range(n_bins):
            bins_e[i] = count_e[(indices_e == i)].sum()
            bins_ne[i] = count_ne[(indices_ne == i)].sum()

        return bins_e, bins_ne

    def merge(self, bsketch):
        """Merge current instance with another BSketch instance.

        Parameters
        ----------
        bsketch : object
            BSketch instance.
        """
        if not self._mergeable(bsketch):
            raise Exception("bsketch does not share signature.")

        if bsketch._sketch_e.n == 0 and bsketch._sketch_ne.n == 0:
            return

        if self._sketch_e.n == 0 and self._sketch_ne.n == 0:
            self._copy(bsketch)
            return

        # Merge sketches
        if self.sketch == "gk":
            self._sketch_e.merge(bsketch._sketch_e)
            self._sketch_ne.merge(bsketch._sketch_ne)
        elif self.sketch == "t-digest":
            self._sketch_e += bsketch._sketch_e
            self._sketch_ne += bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e += bsketch._count_missing_e
        self._count_missing_ne += bsketch._count_missing_ne
        self._count_special_e += bsketch._count_special_e
        self._count_special_ne += bsketch._count_special_ne

    def merge_sketches(self):
        """Merge event and non-event data internal sketches."""
        if self.sketch == "gk":
            new_sketch = GK(self.eps)
            new_sketch.merge(self._sketch_e)
            new_sketch.merge(self._sketch_ne)
        else:
            new_sketch = self._sketch_e + self._sketch_ne

        return new_sketch

    def _copy(self, bsketch):
        self._sketch_e = bsketch._sketch_e
        self._sketch_ne = bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e = bsketch._count_missing_e
        self._count_missing_ne = bsketch._count_missing_ne
        self._count_special_e = bsketch._count_special_e
        self._count_special_ne = bsketch._count_special_ne

    def _indices_count(self, sketch, splits):
        values = np.zeros(len(sketch))
        count = np.zeros(len(sketch))

        if self.sketch == "gk":
            for i, entry in enumerate(sketch.entries):
                values[i] = entry.value
                count[i] = entry.g
        elif self.sketch == "t-digest":
            for i, key in enumerate(sketch.C.keys()):
                centroid = sketch.C.get_value(key)
                values[i] = centroid.mean
                count[i] = centroid.count

        indices = np.searchsorted(splits, values, side='left')

        return indices, count

    def _mergeable(self, other):
        special_eq = True
        if self.special_codes is not None and other.special_codes is not None:
            special_eq = set(self.special_codes) == set(other.special_codes)

        return (self.sketch == other.sketch and self.eps == other.eps and
                self.K == other.K and special_eq)

    @property
    def n_event(self):
        """Event count.

        Returns
        -------
        n_event : int
        """
        count = self._sketch_e.n
        return count + self._count_missing_e + self._count_special_e

    @property
    def n_nonevent(self):
        """Non-event count.

        Returns
        -------
        n_nonevent : int
        """
        count = self._sketch_ne.n
        return count + self._count_missing_ne + self._count_special_ne

    @property
    def n(self):
        """Records count.

        Returns
        -------
        n : int
        """
        return self.n_event + self.n_nonevent
def initialise_digest(v):
    d = TDigest()
    d.update(v)
    return d
def run(self):
    global lock
    global seriesId
    global timestamp

    with lock:
        ## Randomly pick a series ID to start for this process.
        seriesId = random.randint(
            0, len(self.dimensionEvents) + len(self.dimensionMetrics) - 1)
        timestamp = getTimestampMillis()
        print("Process {} using start series ID: {}".format(
            self.processId, seriesId))

    ## Register sigint handler
    signal.signal(signal.SIGINT, signalHandler)

    overallSummary = None
    ingestionStart = timer()
    try:
        for threadId in range(self.args.concurrency):
            threadIdStr = "{}-{}".format(self.processId, threadId + 1)
            print("Starting ThreadId: {}".format(threadIdStr))
            thread = IngestionThread(threadIdStr, self.args,
                                     self.dimensionMetrics,
                                     self.dimensionEvents,
                                     self.highUtilizationHosts,
                                     self.lowUtilizationHosts, self.event)
            thread.start()
            self.threads.append(thread)

        success = 0
        count = 0
        totalLatency = 0.0
        aggregatedDigests = TDigest()
        pooledVariance = 0.0
        for t in self.threads:
            t.join()
            success += t.success
            ## Pool the variance.
            if count == 0:
                pooledVariance = t.variance
            else:
                pooledVariance = ((count - 1) * pooledVariance +
                                  (t.count - 1) * t.variance) / (
                                      (count - 1) + (t.count - 1))
            count += t.count
            aggregatedDigests += t.digest
            totalLatency += t.sum

        print(
            "[Process: {}] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
            .format(self.processId, count, success,
                    round(totalLatency / count, 3),
                    round(math.sqrt(pooledVariance), 3),
                    round(aggregatedDigests.percentile(50), 3),
                    round(aggregatedDigests.percentile(90), 3),
                    round(aggregatedDigests.percentile(99), 3)))
        overallSummary = IngestionSummaryStats(aggregatedDigests, count,
                                               success, totalLatency,
                                               pooledVariance)
        ingestionEnd = timer()
        print("Total time to ingest: {:,} seconds".format(
            round(ingestionEnd - ingestionStart, 2)))
    finally:
        self.conn.send(overallSummary)
from tdigest import TDigest

consumer = KafkaConsumer('demo-topic',
                         group_id=None,
                         bootstrap_servers='127.0.0.1:9092',
                         value_deserializer=lambda v: json.loads(v))

entity_detectors = {}
counter = 0

for msg in consumer:
    entity_id = msg.value['id']
    value = msg.value['value']

    if entity_id not in entity_detectors:
        entity_detectors[entity_id] = TDigest()

    # Get entity specific anomaly detector
    detector = entity_detectors[entity_id]

    # Check if detector is empty
    if (10 > len(detector)):
        detector.update(value)
        continue

    # Get bounds
    upp_bound = detector.percentile(99.9)
    low_bound = detector.percentile(0.1)

    # Display info
    if (0 == (counter % 5)):
        pass  # snippet truncated here in the source
class IngestionThread(threading.Thread):
    def __init__(self, threadId, args, dimensionMetrics, dimensionEvents,
                 highUtilizationHosts, lowUtilizationHosts, event):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.args = args
        self.dimensionMetrics = dimensionMetrics
        self.dimensionEvents = dimensionEvents
        self.client = tswrite.createWriteClient(args.endpoint,
                                                profile=args.profile)
        self.databaseName = args.databaseName
        self.tableName = args.tableName
        self.numMetrics = len(dimensionMetrics)
        self.numEvents = len(dimensionEvents)
        self.digest = TDigest()  ## Use the t-digest to compute the streaming percentiles
        self.count = 0
        self.success = 0
        self.sum = 0.0
        self.variance = float('nan')
        self.highUtilizationHosts = highUtilizationHosts
        self.lowUtilizationHosts = lowUtilizationHosts
        self.sigInt = False
        self.event = event

    def run(self):
        global seriesId
        global timestamp
        global lock

        idx = 0
        mean = 0.0
        squared = 0.0
        while True:
            with lock:
                if self.sigInt == True or sigInt == True or self.event.is_set():
                    print("Thread {} exiting.".format(self.threadId))
                    break

                seriesId += 1
                if seriesId >= self.numMetrics + self.numEvents:
                    ## Wrapping around, so move to new timestamp.
                    seriesId = 0
                    newTimestamp = timestamp + self.args.intervalMillis
                    currentTime = getCurrentTimestampMillis()
                    ## Check if the timestamps are falling behind
                    if newTimestamp < currentTime - 0.05 * self.args.intervalMillis:
                        print(
                            "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes."
                            .format(self.args.intervalMillis,
                                    currentTime - timestamp))
                        ## Move time forward.
                        timestamp = getTimestampMillis()
                    else:
                        timestamp = newTimestamp
                        ## Check if we are ingesting too fast, then slow down.
                        if timestamp > currentTime - 1000:
                            ## Slow down
                            sleepTimeSecs = int((timestamp - currentTime) / 1000)
                            print("Thread {} sleeping for {} secs".format(
                                self.threadId, sleepTimeSecs))
                            time.sleep(sleepTimeSecs)

                    now = datetime.datetime.now()
                    print(
                        "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}."
                        .format(self.threadId,
                                now.strftime("%Y-%m-%d %H:%M:%S"), timestamp))

                localSeriesId = seriesId
                localTimestamp = timestamp

            if localSeriesId < self.numMetrics:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionMetrics[localSeriesId])
                records = model.createRandomMetrics(seriesId, localTimestamp,
                                                    "MILLISECONDS",
                                                    self.highUtilizationHosts,
                                                    self.lowUtilizationHosts)
            else:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionEvents[localSeriesId - self.numMetrics])
                records = model.createRandomEvent(localTimestamp,
                                                  "MILLISECONDS")

            idx += 1
            start = timer()
            try:
                writeResult = tswrite.writeRecords(self.client,
                                                   self.databaseName,
                                                   self.tableName,
                                                   commonAttributes, records)
                self.success += 1
            except Exception as e:
                print(e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
                requestId = "RequestId: {}".format(
                    e.response['ResponseMetadata']['RequestId'])
                print(requestId)
                print(json.dumps(commonAttributes, indent=2))
                print(json.dumps(records, indent=2))
                continue
            finally:
                self.count += 1

            end = timer()
            cur = end - start
            self.digest.update(cur)
            self.sum += cur
            ## Computing the streaming M^2 (squared distance from mean)
            delta = cur - mean
            mean += delta / self.count
            squared += delta * (cur - mean)
            if self.count > 1:
                self.variance = float(squared / (self.count - 1))

            requestId = writeResult['ResponseMetadata']['RequestId']
            if idx % 1000 == 0:
                now = datetime.datetime.now()
                print(
                    "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
                    .format(self.threadId, idx,
                            now.strftime("%Y-%m-%d %H:%M:%S"), requestId,
                            round(self.sum / self.count, 3),
                            round(math.sqrt(self.variance), 3),
                            round(self.digest.percentile(50), 3),
                            round(self.digest.percentile(90), 3),
                            round(self.digest.percentile(99), 3)))

    def interrupt(self):
        print("Interrupting thread: ", self.threadId)
        self.sigInt = True
class Percentiles(TableModule):
    parameters = [
        ("percentiles", np.dtype(np.object_), [0.25, 0.5, 0.75]),
        ("history", np.dtype(int), 3),
    ]
    inputs = [SlotDescriptor("table", type=Table)]

    def __init__(self,
                 column: str,
                 percentiles: Optional[Union[List[float],
                                             np.ndarray[Any, Any]]] = None,
                 **kwds: Any) -> None:
        if not column:
            raise ProgressiveError("Need a column name")
        super(Percentiles, self).__init__(**kwds)
        self._columns = [column]
        self.default_step_size = 1000
        self.tdigest = TDigest()
        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():  # type: ignore
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
        if (percentiles != 0.5).all():  # median isn't included
            lh = percentiles[percentiles < 0.5]
            uh = percentiles[percentiles > 0.5]
            percentiles = np.hstack([lh, 0.5, uh])
        self._percentiles = percentiles
        self._pername: List[str] = [_pretty_name(x) for x in self._percentiles]
        dshape = "{" + ",".join(["%s: real" % n for n in self._pername]) + "}"
        self.result = Table(self.generate_table_name("percentiles"),
                            dshape=dshape,
                            create=True)

    def is_ready(self) -> bool:
        slot = self.get_input_slot("table")
        if slot is not None and slot.created.any():
            return True
        return super(Percentiles, self).is_ready()

    def reset(self) -> None:
        self.tdigest = TDigest()

    @process_slot("table", reset_cb="reset")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        assert self.context
        with self.context as ctx:
            dfslot = ctx.table
            indices = dfslot.created.next(length=step_size)
            steps = indices_len(indices)
            if steps == 0:
                return self._return_run_step(self.state_blocked,
                                             steps_run=steps)
            input_df = dfslot.data()
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x[0])
            df = self.table
            values = {}
            for n, p in zip(self._pername, self._percentiles):
                values[n] = self.tdigest.percentile(p * 100)
            df.add(values)
            # with self.lock:
            #     df.loc[run_number] = values
            #     if len(df) > self.params.history:
            #         self._df = df.loc[df.index[-self.params.history:]]
            return self._return_run_step(self.next_state(dfslot),
                                         steps_run=steps)
def digest_partitions(values):
    digest = TDigest()
    digest.batch_update(values)
    return [digest]
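# The list-of-one-digest return value suggests this function is applied per
# partition and the results are then reduced. A minimal sketch of combining
# the per-partition digests with plain Python lists (the reduce step relies on
# TDigest's `+` merge, as used elsewhere in these snippets; the data here is
# illustrative):
from functools import reduce
from operator import add
from tdigest import TDigest

partitions = [[1.0, 2.0, 3.0], [10.0, 20.0], [0.5, 0.7, 0.9]]
digests = [d for part in partitions for d in digest_partitions(part)]
merged = reduce(add, digests, TDigest())
print(merged.percentile(50))  # approximate median across all partitions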
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.digest = TDigest()
def find_equalization_params(self, batch, component, survey_id_col,
                             sample_size=10000,
                             container_name='equal_params', **kwargs):
    """Estimates 95th percentile of absolute values for each seismic survey
    in dataset for equalization.

    This method utilizes a t-digest structure for batch-wise estimation of
    rank-based statistics, namely the 95th percentile.

    Parameters
    ----------
    batch : SeismicBatch or B() named expression.
        Current batch from pipeline.
    component : str
        Component with shot gathers.
    survey_id_col : str
        Column in index that indicates names of seismic surveys from
        different seasons.
    sample_size : int, optional
        Number of elements to draw from each shot gather to update estimates
        of TDigest. Time for each update grows linearly with `sample_size`.
        Default is 10000.
    container_name : str, optional
        Name of the `SeismicDataset` attribute to store a dict with the
        estimated percentile. Also contains `survey_id_col` key and
        corresponding value.
    kwargs : misc
        Parameters for TDigest objects.

    Raises
    ------
    ValueError : If index is not FieldIndex.
    ValueError : If a shot gather with the same id is contained in more than
        one survey.

    Note
    ----
    Dictionary with the estimated percentile can be obtained from the pipeline
    using `D(container_name)`.
    """
    if not isinstance(self.index, FieldIndex):
        raise ValueError("Index must be FieldIndex, not {}".format(type(self.index)))

    private_name = '_' + container_name
    params = getattr(self, private_name, None)
    if params is None:
        surveys = np.unique(self.index.get_df()[survey_id_col])
        delta, k = kwargs.pop('delta', 0.01), kwargs.pop('K', 25)
        params = dict(zip(surveys, [TDigest(delta, k) for _ in surveys]))
        setattr(self, private_name, params)

    for idx in batch.indices:
        surveys_by_fieldrecord = np.unique(batch.index.get_df(index=idx)[survey_id_col])
        if len(surveys_by_fieldrecord) != 1:
            raise ValueError('Field {} represents data from more than one survey!'.format(idx))
        survey = surveys_by_fieldrecord[0]

        pos = batch.index.get_pos(idx)
        sample = np.random.choice(getattr(batch, component)[pos].reshape(-1), size=sample_size)
        params[survey].batch_update(np.absolute(sample))

    statistics = dict([survey, digest.percentile(95)]
                      for survey, digest in params.items() if digest.n > 0)
    statistics['survey_id_col'] = survey_id_col
    setattr(self, container_name, statistics)
from sim_wallet import simWallet
from tdigest import TDigest
import time

digest = TDigest()
wallet = simWallet()
wallet.print_wallet()
initial_value = wallet.estimate_total()
print('Initial wallet value is {} BTC.'.format(initial_value), flush=True)

while True:
    current_price = wallet.update_price()
    digest.update(current_price)
    digest_value = digest.percentile(15)
    print('\n\nCurrent BNB/BTC price is {}. Digest value is {}'.format(
        current_price, digest_value), flush=True)
    if current_price < 0.9 * digest_value:
        wallet.buy_bnb(1)
    if current_price > 1.1 * digest_value:
        wallet.sell_bnb(1)
class SeasonalDecomposition(BaseTask):

    def __init__(self, config, logger, options):
        super(SeasonalDecomposition, self).__init__(
            config, logger, resource={'metric_sink': 'RedisSink',
                                      'output_sink': 'GraphiteSink'})
        self.plugin = options['plugin']
        self.service = options['service']
        self.params = options['params']
        self.tdigest_key = 'td:%s' % self.service
        self.td = TDigest()
        self.error_eval = {
            'tukey': self._eval_tukey,
            'quantile': self._eval_quantile
        }

    def _eval_quantile(self, error):
        state = {}
        alpha = self.params['error_params']['alpha']
        lower = self.td.quantile(alpha / 2)
        upper = self.td.quantile(1 - alpha / 2)
        if 'minimal_lower_threshold' in self.params['error_params']:
            lower = max(
                lower, self.params['error_params']['minimal_lower_threshold'])
        if 'minimal_upper_threshold' in self.params['error_params']:
            upper = min(
                upper, self.params['error_params']['minimal_upper_threshold'])
        flag = 0
        if error > upper:
            flag = 1
        elif error < lower:
            flag = -1
        state['flag'] = flag
        state['lower'] = lower
        state['upper'] = upper
        state['alpha'] = alpha
        return state

    def _eval_tukey(self, error):
        state = {}
        iqr_scaling = self.params['error_params'].get('iqr_scaling', 1.5)
        quantile_25 = self.td.quantile(0.25)
        quantile_75 = self.td.quantile(0.75)
        iqr = quantile_75 - quantile_25
        lower = quantile_25 - iqr_scaling * iqr
        upper = quantile_75 + iqr_scaling * iqr
        if 'minimal_lower_threshold' in self.params['error_params']:
            lower = max(
                lower, self.params['error_params']['minimal_lower_threshold'])
        if 'minimal_upper_threshold' in self.params['error_params']:
            upper = min(
                upper, self.params['error_params']['minimal_upper_threshold'])
        flag = 0
        if error > upper:
            flag = 1
        elif error < lower:
            flag = -1
        state['flag'] = flag
        state['lower'] = lower
        state['upper'] = upper
        return state

    def read(self):
        metric = self.params['metric']
        period_length = self.params['period_length']
        seasons = self.params['seasons']
        default = self.params['default']

        tdigest_json = [el for el in self.metric_sink.read(self.tdigest_key)]
        if tdigest_json:
            centroids = json.loads(tdigest_json[0])
            [self.td.add(c[0], c[1]) for c in centroids]

        # gather data and assure requirements
        data = [el for el in self.metric_sink.read(metric)]
        data = sorted(data, key=lambda tup: tup.timestamp)
        step_size = find_step_size(data)
        if not step_size:
            self.logger.error(
                'Datapoints have no common time grid or are not enough. Exiting')
            return None
        if data[-1].timestamp - int(time()) > 2 * step_size:
            self.logger.error('Datapoints are too old (%d sec). Exiting' % (
                data[-1].timestamp - int(time())))
            return None
        data = insert_missing_datapoints(data, default, step_size)
        if len(data) < period_length * seasons:
            self.logger.error(
                'Not enough (%d) datapoints. Exiting' % len(data))
            return None
        data = data[-period_length * seasons - 1:-1]

        return data

    def process(self, data):
        if data:
            period_length = self.params['period_length']
            error_type = self.params.get('error_type', 'norm')
            data = [float(el.value) for el in data]

            try:
                r_stl = robjects.r.stl
                r_ts = robjects.r.ts
                r_data_ts = r_ts(data, frequency=period_length)
                r_res = r_stl(r_data_ts, s_window="periodic", robust=True)
                r_res_ts = asarray(r_res[0])
                seasonal = r_res_ts[:, 0][-1]
                trend = r_res_ts[:, 1][-1]
                _error = r_res_ts[:, 2][-1]
                model = seasonal + trend
            except Exception as e:
                self.logger.error('STL Call failed: %s. Exiting' % e)
                return (0.0, 0.0, 0.0, {'flag': -1})

            if error_type == 'norm':
                error = _error / model if model != 0 else -1
            elif error_type == 'median':
                error = data[-1] - seasonal - median(data)
            elif error_type == 'stl':
                error = _error

            # add error to distribution and evaluate
            self.td.add(error, 1.0)
            state = self.error_eval[self.params['error_handling']](error)
            self.metric_sink.write(
                [RedisGeneric(self.tdigest_key, self.td.serialize())])

            return (seasonal, trend, error, state)
        else:
            return (0.0, 0.0, 0.0, {'flag': -1})

    def write(self, state):
        (seasonal, trend, error, state) = state
        prefix = '%s.%s' % (self.plugin, self.service)
        now = int(time())
        for name, value in state.iteritems():
            self.sink.write(
                TimeSeriesTuple('%s.%s' % (prefix, name), now, value))
        self.sink.write(
            TimeSeriesTuple('%s.%s' % (prefix, 'seasonal'), now, seasonal))
        self.sink.write(
            TimeSeriesTuple('%s.%s' % (prefix, 'trend'), now, trend))
        self.sink.write(
            TimeSeriesTuple('%s.%s' % (prefix, 'error'), now, error))

    def run(self):
        data = self.read()
        state = self.process(data)
        self.write(state)
        return True
if path.exists(decompressed_fname) is False:
    print("Decompressing {}".format(filename))
    decompress_file(filename)

docs = []
tree = ET.iterparse(decompressed_fname)
print("Reading {}\n".format(decompressed_fname))
progress = tqdm(unit="docs")
doc = {}
text = None
comment = None
username = None
timestamp = None
ts_digest = TDigest()
for event, elem in tree:
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
        doc = {}
        doc["title"] = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}title")
        doc["text"] = text
        doc["comment"] = comment
        doc["username"] = username
        doc["timestamp"] = int(timestamp)
        ts_digest.update(int(timestamp))
        if doc["text"] is not None and doc["comment"] is not None \
                and doc["username"] is not None and doc["timestamp"] is not None:
            total_docs = total_docs + 1
            docs.append(doc)
            progress.update()
        elem.clear()  # won't need the children any more
def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    all_csvfile = open(all_fname, 'a', newline='')
    bench_csvfile = open(bench_fname, 'w', newline='')
    all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)

    progress = tqdm(unit="docs", total=total_benchmark_commands)
    total_docs = len(docs)

    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    query_range_digest = TDigest()

    generated_commands = 0
    while generated_commands < total_benchmark_commands:
        query_ts_pdist = timestamps_pdist[generated_commands]
        percentile = (1.0 - query_ts_pdist) * 100.0
        query_min_ts = ts_digest.percentile(percentile)

        random_doc_pos = random.randint(0, total_docs - 1)
        doc = docs[random_doc_pos]

        # decide read or write
        p_cmd = random.random()
        if p_cmd < p_writes:
            ## WRITE
            total_benchmark_writes = total_benchmark_writes + 1
            generated_row, doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"], doc["comment"],
                                                      doc["username"], doc["timestamp"], generated_commands)
        else:
            ## READ
            total_benchmark_reads = total_benchmark_reads + 1
            words, totalW = getQueryWords(doc, stop_words, 2)
            choice = random.choices(["simple-1word-query", "2word-union-query", "2word-intersection-query"])[0]
            generated_row = None
            numeric_range_str = ""
            if use_numeric_range_searchs:
                numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                query_range_digest.update(int(max_ts - query_min_ts))
            if choice == "simple-1word-query" and len(words) >= 1:
                generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                       "{}{}".format(numeric_range_str, words[0]))
            elif choice == "2word-union-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                       "{}{} {}".format(numeric_range_str, words[0], words[1]))
            elif choice == "2word-intersection-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                       "{}{}|{}".format(numeric_range_str, words[0], words[1]))

        if generated_row != None:
            # all_csv_writer.writerow(generated_row)
            # bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands = generated_commands + 1

    progress.close()
    bench_csvfile.close()
    all_csvfile.close()

    # print()
    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)
    print("90% of the read queries target at max {} percent of keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent of keyspace".format(ts_digest.cdf(max_ts - min_ts)))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))

    plt.scatter(xx, yy)
    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()
    return total_benchmark_reads, total_benchmark_writes
numberOfEnds = 3
duration = 10000
lengthOfInterval = 1000
numberOfIntervals = round(duration / lengthOfInterval)
# an interval will not be logged unless an event happens afterwards,
# so the last interval will not be logged

# interval counters
i = 1
j = 1
k = 1
l = 1

# stores values per interval
t = 0
q = [0 for x in range(numberOfNodes)]
c = [0 for y in range(numberOfNodes)]

# stores all values
throughput = []
timeToComplete = []
numberOfArrivals = []
cpu = []

# tdigest
time = TDigest()