class Service(object): """A service that can be started or stopped.""" status = attr.ib(default=ServiceStatus.STOPPED, type=ServiceStatus, init=False) _service_lock = attr.ib(factory=threading.Lock, repr=False, init=False) def __enter__(self): self.start() return self def __exit__(self, exc_type, exc_value, traceback): self.stop() self.join() def start(self): """Start the service.""" # Use a lock so we're sure that if 2 threads try to start the service at the same time, one of them will raise # an error. with self._service_lock: if self.status == ServiceStatus.RUNNING: raise ServiceAlreadyRunning("%s is already running" % self.__class__.__name__) self.status = ServiceStatus.RUNNING def stop(self): """Stop the service.""" self.status = ServiceStatus.STOPPED @staticmethod def join(timeout=None): """Join the service once stopped."""
class LockCollector(collector.CaptureSamplerCollector): """Record lock usage.""" nframes = attr.ib( factory=_attr.from_env("DD_PROFILING_MAX_FRAMES", 64, int)) tracer = attr.ib(default=None) def start(self): """Start collecting `threading.Lock` usage.""" super(LockCollector, self).start() self.patch() def stop(self): """Stop collecting `threading.Lock` usage.""" self.unpatch() super(LockCollector, self).stop() def patch(self): """Patch the threading module for tracking lock allocation.""" # We only patch the lock from the `threading` module. # Nobody should use locks from `_thread`; if they do so, then it's deliberate and we don't profile. self.original = threading.Lock def _allocate_lock(wrapped, instance, args, kwargs): lock = wrapped(*args, **kwargs) return _ProfiledLock(lock, self.recorder, self.tracer, self.nframes, self._capture_sampler) threading.Lock = FunctionWrapper(self.original, _allocate_lock) def unpatch(self): """Unpatch the threading module for tracking lock allocation.""" threading.Lock = self.original
class Profiler(object): """Run profiling while code is executed. Note that the whole Python process is profiled, not only the code executed. Data from all running threads are caught. If no collectors are provided, default ones are created. If no exporters are provided, default ones are created. """ collectors = attr.ib(factory=_build_default_collectors) exporters = attr.ib(factory=_build_default_exporters) schedulers = attr.ib(init=False, factory=list) status = attr.ib(init=False, type=ProfilerStatus, default=ProfilerStatus.STOPPED) def __attrs_post_init__(self): if self.exporters: for rec in self.recorders: self.schedulers.append(scheduler.Scheduler(recorder=rec, exporters=self.exporters)) @property def recorders(self): return set(c.recorder for c in self.collectors) def start(self): """Start the profiler.""" for col in self.collectors: try: col.start() except RuntimeError: # `tracemalloc` is unavailable? pass for s in self.schedulers: s.start() self.status = ProfilerStatus.RUNNING def stop(self, flush=True): """Stop the profiler. This stops all the collectors and schedulers, waiting for them to finish their operations. :param flush: Wait for the flush of the remaining events before stopping. """ for col in reversed(self.collectors): col.stop() for col in reversed(self.collectors): col.join() for s in reversed(self.schedulers): s.stop() if flush: for s in reversed(self.schedulers): s.join() self.status = ProfilerStatus.STOPPED
class CaptureSamplerCollector(Collector): capture_pct = attr.ib( factory=_attr.from_env("DD_PROFILING_CAPTURE_PCT", 5, float)) _capture_sampler = attr.ib(default=attr.Factory(_create_capture_sampler, takes_self=True), init=False, repr=False)
class Scheduler(_periodic.PeriodicService): """Schedule export of recorded data.""" recorder = attr.ib() exporters = attr.ib() _interval = attr.ib(factory=_attr.from_env("DD_PROFILING_UPLOAD_INTERVAL", 60, float)) _last_export = attr.ib(init=False, default=None) def start(self): """Start the scheduler.""" LOG.debug("Starting scheduler") super(Scheduler, self).start() self._last_export = compat.time_ns() LOG.debug("Scheduler started") def flush(self): """Flush events from recorder to exporters.""" LOG.debug("Flushing events") if self.exporters: events = self.recorder.reset() start = self._last_export self._last_export = compat.time_ns() total_events = sum(len(v) for v in events.values()) for exp in self.exporters: try: exp.export(events, start, self._last_export) except exporter.ExportError as e: LOG.error("Unable to export %d events: %s", total_events, _traceback.format_exception(e)) except Exception: LOG.exception("Error while exporting %d events", total_events) periodic = flush on_shutdown = flush
class Collector(object): """A profile collector.""" recorder = attr.ib() status = attr.ib(default=CollectorStatus.STOPPED, type=CollectorStatus, repr=False, init=False) def __enter__(self): self.start() return self def __exit__(self, exc_type, exc_value, traceback): return self.stop() @staticmethod def _init(): pass @abc.abstractmethod def start(self): """Start collecting profiles.""" if self.status == CollectorStatus.RUNNING: raise RuntimeError("Collector is already running") self.status = CollectorStatus.RUNNING self._init() @abc.abstractmethod def stop(self): """Stop collecting profiles.""" self.status = CollectorStatus.STOPPED
class MemoryHeapSampleEvent(event.StackBasedEvent): """A sample storing memory allocation tracked.""" size = attr.ib(default=None) """Allocation size in bytes.""" sample_size = attr.ib(default=None) """The sampling size."""
class UncaughtExceptionEvent(event.Event): """A lock has been acquired.""" frames = attr.ib(default=None) nframes = attr.ib(default=None) exc_type = attr.ib(default=None) thread_id = attr.ib(default=None) thread_name = attr.ib(default=None)
class Scheduler(object): """Schedule export of recorded data.""" recorder = attr.ib() exporters = attr.ib() interval = attr.ib( factory=_attr.from_env("DD_PROFILING_UPLOAD_INTERVAL", 60, float)) _periodic = attr.ib(init=False, default=None) _last_export = attr.ib(init=False, default=None) def __enter__(self): self.start() return self def start(self): """Start the scheduler.""" self._periodic = _periodic.PeriodicThread( self.interval, self.flush, name="%s:%s" % (__name__, self.__class__.__name__)) LOG.debug("Starting scheduler") self._last_export = compat.time_ns() self._periodic.start() LOG.debug("Scheduler started") def __exit__(self, exc_type, exc_value, traceback): return self.stop() def stop(self, flush=True): """Stop the scheduler. :param flush: Whetever to do a final flush. """ LOG.debug("Stopping scheduler") if self._periodic: self._periodic.stop() self._periodic.join() self._periodic = None if flush: self.flush() LOG.debug("Scheduler stopped") def flush(self): """Flush events from recorder to exporters.""" LOG.debug("Flushing events") events = self.recorder.reset() start = self._last_export self._last_export = compat.time_ns() total_events = sum(len(v) for v in events.values()) for exp in self.exporters: try: exp.export(events, start, self._last_export) except exporter.ExportError as e: LOG.error("Unable to export %d events: %s", total_events, _traceback.format_exception(e)) except Exception: LOG.exception("Error while exporting %d events", total_events)
class Recorder(object): """An object that records program activity.""" _DEFAULT_MAX_EVENTS = 32768 default_max_events = attr.ib(default=_DEFAULT_MAX_EVENTS) """The maximum number of events for an event type if one is not specified.""" max_events = attr.ib(factory=dict) """A dict of {event_type_class: max events} to limit the number of events to record.""" events = attr.ib(init=False, repr=False) _events_lock = attr.ib(init=False, repr=False, factory=_nogevent.DoubleLock) def __attrs_post_init__(self): self._reset_events() def push_event(self, event): """Push an event in the recorder. :param event: The `ddtrace.profiling.event.Event` to push. """ return self.push_events([event]) def push_events(self, events): """Push multiple events in the recorder. All the events MUST be of the same type. There is no sanity check as whether all the events are from the same class for performance reasons. :param events: The event list to push. """ if events: event_type = events[0].__class__ with self._events_lock: q = self.events[event_type] q.extend(events) def _get_deque_for_event_type(self, event_type): return collections.deque(maxlen=self.max_events.get(event_type, self.default_max_events)) def _reset_events(self): self.events = _defaultdictkey(self._get_deque_for_event_type) def reset(self): """Reset the recorder. This is useful when e.g. exporting data. Once the event queue is retrieved, a new one can be created by calling the reset method, avoiding iterating on a mutating event list. :return: The list of events that has been removed. """ with self._events_lock: events = self.events self._reset_events() return events
class MemoryAllocSampleEvent(event.StackBasedEvent): """A sample storing memory allocation tracked.""" size = attr.ib(default=None) """Allocation size in bytes.""" capture_pct = attr.ib(default=None) """The capture percentage.""" nevents = attr.ib(default=None) """The total number of allocation events sampled."""
class MemoryCollector(collector.PeriodicCollector): """Memory allocation collector.""" _DEFAULT_MAX_EVENTS = 32 _DEFAULT_INTERVAL = 0.5 # Arbitrary interval to empty the _memalloc event buffer _interval = attr.ib(default=_DEFAULT_INTERVAL, repr=False) # TODO make this dynamic based on the 1. interval and 2. the max number of events allowed in the Recorder _max_events = attr.ib(factory=_attr.from_env("_DD_PROFILING_MEMORY_EVENTS_BUFFER", _DEFAULT_MAX_EVENTS, int)) max_nframe = attr.ib(factory=_attr.from_env("DD_PROFILING_MAX_FRAMES", 64, int)) heap_sample_size = attr.ib(factory=_attr.from_env("DD_PROFILING_HEAP_SAMPLE_SIZE", 0, int)) ignore_profiler = attr.ib(factory=_attr.from_env("DD_PROFILING_IGNORE_PROFILER", True, formats.asbool)) def start(self): """Start collecting memory profiles.""" if _memalloc is None: raise RuntimeError("memalloc is unavailable") _memalloc.start(self.max_nframe, self._max_events, self.heap_sample_size) super(MemoryCollector, self).start() def stop(self): if _memalloc is not None: try: _memalloc.stop() except RuntimeError: pass super(MemoryCollector, self).stop() def collect(self): events, count, alloc_count = _memalloc.iter_events() capture_pct = 100 * count / alloc_count # TODO: The event timestamp is slightly off since it's going to be the time we copy the data from the # _memalloc buffer to our Recorder. This is fine for now, but we might want to store the nanoseconds # timestamp in C and then return it via iter_events. return ( tuple( MemoryAllocSampleEvent( thread_id=thread_id, thread_name=_threading.get_thread_name(thread_id), thread_native_id=_threading.get_thread_native_id(thread_id), frames=stack, nframes=nframes, size=size, capture_pct=capture_pct, nevents=alloc_count, ) for (stack, nframes, thread_id), size in events # TODO: this should be implemented in _memalloc directly so we have more space for samples # not coming from the profiler if not self.ignore_profiler or not any(frame[0].startswith(_MODULE_TOP_DIR) for frame in stack) ), )
class _Sequence(object): start_at = attr.ib(default=1) next_id = attr.ib(init=False, default=None) def __attrs_post_init__(self): self.next_id = self.start_at def generate(self): """Generate a new unique id and return it.""" generated_id = self.next_id self.next_id += 1 return generated_id
class PeriodicService(service.Service): """A service that runs periodically.""" _interval = attr.ib() _worker = attr.ib(default=None, init=False, repr=False) _real_thread = False "Class variable to override if the service should run in a real OS thread." @property def interval(self): return self._interval @interval.setter def interval(self, value): self._interval = value # Update the interval of the PeriodicThread based on ours if self._worker: self._worker.interval = value def _start(self): # type: () -> None """Start the periodic service.""" periodic_thread_class = PeriodicRealThreadClass( ) if self._real_thread else PeriodicThread self._worker = periodic_thread_class( self.interval, target=self.periodic, name="%s:%s" % (self.__class__.__module__, self.__class__.__name__), on_shutdown=self.on_shutdown, ) self._worker.start() def join(self, timeout=None): if self._worker: self._worker.join(timeout) def stop(self): """Stop the periodic collector.""" if self._worker: self._worker.stop() super(PeriodicService, self).stop() @staticmethod def on_shutdown(): pass @staticmethod def periodic(): pass
class StackBasedEvent(SampleEvent): thread_id = attr.ib(default=None) thread_name = attr.ib(default=None) thread_native_id = attr.ib(default=None) task_id = attr.ib(default=None) task_name = attr.ib(default=None) frames = attr.ib(default=None) nframes = attr.ib(default=None) trace_ids = attr.ib(default=None) span_ids = attr.ib(default=None)
class Scheduler(_periodic.PeriodicService): """Schedule export of recorded data.""" recorder = attr.ib() exporters = attr.ib() _interval = attr.ib( factory=_attr.from_env("DD_PROFILING_UPLOAD_INTERVAL", 60, float)) _configured_interval = attr.ib(init=False) _last_export = attr.ib(init=False, default=None) def __attrs_post_init__(self): # Copy the value to use it later since we're going to adjust the real interval self._configured_interval = self.interval def start(self): """Start the scheduler.""" LOG.debug("Starting scheduler") super(Scheduler, self).start() self._last_export = compat.time_ns() LOG.debug("Scheduler started") def flush(self): """Flush events from recorder to exporters.""" LOG.debug("Flushing events") if self.exporters: events = self.recorder.reset() start = self._last_export self._last_export = compat.time_ns() for exp in self.exporters: try: exp.export(events, start, self._last_export) except exporter.ExportError as e: LOG.error("Unable to export profile: %s. Ignoring.", _traceback.format_exception(e)) except Exception: LOG.exception( "Unexpected error while exporting events. " "Please report this bug to https://github.com/DataDog/dd-trace-py/issues" ) def periodic(self): start_time = compat.monotonic() try: self.flush() finally: self.interval = max( 0, self._configured_interval - (compat.monotonic() - start_time)) on_shutdown = flush
class PeriodicCollector(Collector): """A collector that needs to run periodically.""" _real_thread = False _interval = attr.ib(repr=False) _worker = attr.ib(default=None, init=False, repr=False) def start(self): """Start the periodic collector.""" super(PeriodicCollector, self).start() periodic_thread_class = _periodic.PeriodicRealThread if self._real_thread else _periodic.PeriodicThread self._worker = periodic_thread_class( self.interval, target=self.collect, name="%s:%s" % (__name__, self.__class__.__name__)) self._worker.start() @property def interval(self): return self._interval @interval.setter def interval(self, value): self._interval = value # Update the interval of the PeriodicThread based on ours if self._worker: self._worker.interval = value def stop(self): """Stop the periodic collector.""" if self._worker: self._worker.stop() self._worker.join() self._worker = None super(PeriodicCollector, self).stop() def collect(self): """Collect events and push them into the recorder.""" for events in self._collect(): self.recorder.push_events(events) @staticmethod def _collect(): """Collect the actual data. :return: A list of sample list to push in the recorder. """ raise NotImplementedError
class UncaughtExceptionCollector(collector.Collector): """Record uncaught thrown exceptions.""" max_nframes = attr.ib( factory=_attr.from_env("DD_PROFILING_MAX_FRAMES", 64, int)) def start(self): """Start collecting uncaught exceptions.""" self.original_except_hook = sys.excepthook sys.excepthook = self.except_hook super(UncaughtExceptionCollector, self).start() def stop(self): """Stop collecting uncaught exceptions.""" if hasattr(self, "original_except_hook"): sys.excepthook = self.original_except_hook del self.original_except_hook super(UncaughtExceptionCollector, self).stop() def except_hook(self, exctype, value, traceback): try: frames, nframes = _traceback.traceback_to_frames( traceback, self.max_nframes) thread_id, thread_name = threading._current_thread() self.recorder.push_event( UncaughtExceptionEvent(frames=frames, nframes=nframes, thread_id=thread_id, thread_name=thread_name, exc_type=exctype)) finally: return self.original_except_hook(exctype, value, traceback)
class Service(object): """A service that can be started or stopped.""" status = attr.ib(default=ServiceStatus.STOPPED, type=ServiceStatus, init=False) def __enter__(self): self.start() return self def __exit__(self, exc_type, exc_value, traceback): return self.stop() def start(self): """Start the service.""" if self.status == ServiceStatus.RUNNING: raise RuntimeError("%s is already running" % self.__class__.__name__) self.status = ServiceStatus.RUNNING def stop(self): """Stop the service.""" self.status = ServiceStatus.STOPPED @staticmethod def join(timeout=None): """Join the service once stopped."""
class CollectorStatus(object): """A Collector status.""" status = attr.ib() def __repr__(self): return self.status.upper()
class ProfilerStatus(object): """A Profiler status.""" status = attr.ib() def __repr__(self): return self.status.upper()
class Collector(service.Service): """A profile collector.""" recorder = attr.ib() @staticmethod def snapshot(): """Take a snapshot of collected data.
class CaptureSampler(object): """Determine the events that should be captured based on a sampling percentage.""" capture_pct = attr.ib(default=100) _counter = attr.ib(default=0, init=False) @capture_pct.validator def capture_pct_validator(self, attribute, value): if value < 0 or value > 100: raise ValueError("Capture percentage should be between 0 and 100 included") def capture(self): self._counter += self.capture_pct if self._counter >= 100: self._counter -= 100 return True return False
class _StringTable(object): _strings = attr.ib(init=False, factory=lambda: {"": 0}) _seq_id = attr.ib(init=False, factory=_Sequence) def to_id(self, string): try: return self._strings[string] except KeyError: generated_id = self._strings[string] = self._seq_id.generate() return generated_id def __iter__(self): for string, _ in sorted(self._strings.items(), key=_ITEMGETTER_ONE): yield string def __len__(self): return len(self._strings)
class Event(object): """An event happening at a point in time.""" timestamp = attr.ib(factory=compat.time_ns) @property def name(self): """Name of the event.""" return self.__class__.__name__
class PprofFileExporter(pprof.PprofExporter): """PProf file exporter.""" prefix = attr.ib() _increment = attr.ib(default=1, init=False, repr=False) def export(self, events): """Export events to pprof file. The file name is based on the prefix passed to init. The process ID number and type of export is then added as a suffix. :param events: The event dictionary from a `ddtrace.profile.recorder.Recorder`. """ profile = super(PprofFileExporter, self).export(events) with gzip.open( self.prefix + (".%d.%d" % (os.getpid(), self._increment)), "wb") as f: f.write(profile.SerializeToString()) self._increment += 1
class DoubleLock(object): """A lock that prevent concurrency from a gevent coroutine and from a threading.Thread at the same time.""" _lock = attr.ib(factory=threading.Lock, init=False, repr=False) _thread_lock = attr.ib(factory=Lock, init=False, repr=False) def acquire(self): # You cannot acquire a gevent-lock from another thread if it has been acquired already: # make sure we exclude the gevent-lock from being acquire by another thread by using a thread-lock first. self._thread_lock.acquire() self._lock.acquire() def release(self): self._lock.release() self._thread_lock.release() def __enter__(self): self.acquire() return self def __exit__(self, exc_type, exc_val, exc_tb): self.release()
class RuntimeWorker(periodic.PeriodicService): """Worker thread for collecting and writing runtime metrics to a DogStatsd client. """ dogstatsd_url = attr.ib(type=str) _interval = attr.ib( type=float, factory=lambda: float( get_env("runtime_metrics", "interval", default=10))) _dogstatsd_client = attr.ib(init=False, repr=False) _runtime_metrics = attr.ib(factory=RuntimeMetrics, repr=False) def __attrs_post_init__(self): # type: () -> None self._dogstatsd_client = get_dogstatsd_client(self.dogstatsd_url) # force an immediate update constant tags self.update_runtime_tags() # Start worker thread self.start() def flush(self): # type: () -> None with self._dogstatsd_client: for key, value in self._runtime_metrics: log.debug("Writing metric %s:%s", key, value) self._dogstatsd_client.gauge(key, value) def update_runtime_tags(self): # type: () -> None # DEV: ddstatsd expects tags in the form ['key1:value1', 'key2:value2', ...] tags = ["{}:{}".format(k, v) for k, v in RuntimeTags()] log.debug("Updating constant tags %s", tags) self._dogstatsd_client.constant_tags = tags periodic = flush on_shutdown = flush
class LockEventBase(event.Event): """Base Lock event.""" lock_name = attr.ib(default=None) frames = attr.ib(default=None) nframes = attr.ib(default=None) thread_id = attr.ib(default=None) thread_name = attr.ib(default=None) sampling_pct = attr.ib(default=None)
class TraceProcessor(object): _filters = attr.ib(type=List[TraceFilter]) def process(self, trace): # type: (List[Span]) -> Optional[List[Span]] for filtr in self._filters: try: trace = filtr.process_trace(trace) except Exception: log.error("error applying filter %r to traces", filtr, exc_info=True) else: if trace is None: log.debug("dropping trace due to filter %r", filter) return return trace