class LockCollector(collector.CaptureSamplerCollector):
    """Collect lock usage by wrapping a lock factory with a profiled lock.

    Subclasses say which factory is wrapped through `_get_original` and
    `_set_original`. `PROFILED_LOCK_CLASS` is looked up on the instance and is
    not defined in this class.
    """

    nframes = attr.ib(factory=attr_utils.from_env("DD_PROFILING_MAX_FRAMES", 64, int))
    endpoint_collection_enabled = attr.ib(
        factory=attr_utils.from_env("DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", True, formats.asbool)
    )

    tracer = attr.ib(default=None)

    # NOTE(review): this field is named `_original`, while patch()/unpatch() read and
    # write `self.original` -- confirm which of the two names is intended.
    _original = attr.ib(init=False, repr=False, type=typing.Any, cmp=False)

    @abc.abstractmethod
    def _get_original(self):
        # type: (...) -> typing.Any
        pass

    @abc.abstractmethod
    def _set_original(
        self, value  # type: typing.Any
    ):
        # type: (...) -> None
        pass

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Install the patch, then start the underlying service."""
        self.patch()
        super(LockCollector, self)._start_service()

    def _stop_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Stop the underlying service, then remove the patch."""
        super(LockCollector, self)._stop_service()
        self.unpatch()

    def patch(self):
        # type: (...) -> None
        """Replace the original lock factory with one returning profiled locks."""
        # Keep a reference to the unpatched factory so unpatch() can restore it.
        unpatched = self._get_original()
        self.original = unpatched

        def _build_profiled_lock(wrapped, instance, args, kwargs):
            # Create the real lock first, then hand it to the profiling wrapper class.
            raw_lock = wrapped(*args, **kwargs)
            return self.PROFILED_LOCK_CLASS(
                raw_lock,
                self.recorder,
                self.tracer,
                self.nframes,
                self._capture_sampler,
                self.endpoint_collection_enabled,
            )

        self._set_original(FunctionWrapper(unpatched, _build_profiled_lock))

    def unpatch(self):
        # type: (...) -> None
        """Put back the factory that patch() saved in `self.original`."""
        self._set_original(self.original)
class LockCollector(collector.CaptureSamplerCollector):
    """Collect `threading.Lock` usage by wrapping the lock factory."""

    nframes = attr.ib(factory=attr_utils.from_env("DD_PROFILING_MAX_FRAMES", 64, int))
    endpoint_collection_enabled = attr.ib(
        factory=attr_utils.from_env("DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", True, formats.asbool)
    )

    tracer = attr.ib(default=None)

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Patch `threading.Lock`, then start the underlying service."""
        self.patch()
        super(LockCollector, self)._start_service()

    def _stop_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Stop the underlying service, then restore `threading.Lock`."""
        super(LockCollector, self)._stop_service()
        self.unpatch()

    def patch(self):
        # type: (...) -> None
        """Swap `threading.Lock` for a wrapper that returns profiled locks."""
        # Only the `threading` module is patched. Code that reaches for `_thread`
        # locks directly does so deliberately and is left unprofiled.
        unpatched = threading.Lock
        self.original = unpatched

        def _build_profiled_lock(wrapped, instance, args, kwargs):
            # Allocate the real lock, then wrap it so its usage gets recorded.
            raw_lock = wrapped(*args, **kwargs)
            return _ProfiledLock(
                raw_lock,
                self.recorder,
                self.tracer,
                self.nframes,
                self._capture_sampler,
                self.endpoint_collection_enabled,
            )

        threading.Lock = FunctionWrapper(unpatched, _build_profiled_lock)  # type: ignore[misc]

    def unpatch(self):
        # type: (...) -> None
        """Restore the `threading.Lock` that patch() saved in `self.original`."""
        threading.Lock = self.original  # type: ignore[misc]
class Scheduler(periodic.PeriodicService):
    """Periodically hand the recorder's events to every configured exporter."""

    recorder = attr.ib()
    exporters = attr.ib()
    before_flush = attr.ib(default=None, eq=False)
    _interval = attr.ib(factory=attr_utils.from_env("DD_PROFILING_UPLOAD_INTERVAL", 60.0, float))
    _configured_interval = attr.ib(init=False)
    _last_export = attr.ib(init=False, default=None, eq=False)

    def __attrs_post_init__(self):
        # Remember the configured value: `interval` itself is rewritten after every run
        # in periodic().
        self._configured_interval = self.interval

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Start the service and record the start of the first export window."""
        LOG.debug("Starting scheduler")
        super(Scheduler, self)._start_service()
        self._last_export = compat.time_ns()
        LOG.debug("Scheduler started")

    def flush(self):
        """Run the `before_flush` hook, then export the recorder's events.

        A failing hook or exporter is logged and does not stop the flush.
        Nothing is reset or exported when there are no exporters.
        """
        LOG.debug("Flushing events")
        hook = self.before_flush
        if hook is not None:
            try:
                hook()
            except Exception:
                LOG.error("Scheduler before_flush hook failed", exc_info=True)

        if not self.exporters:
            return

        events = self.recorder.reset()
        # The export window runs from the previous export timestamp up to now.
        window_start = self._last_export
        self._last_export = compat.time_ns()
        for sink in self.exporters:
            try:
                sink.export(events, window_start, self._last_export)
            except exporter.ExportError as err:
                LOG.error("Unable to export profile: %s. Ignoring.", _traceback.format_exception(err))
            except Exception:
                LOG.exception(
                    "Unexpected error while exporting events. "
                    "Please report this bug to https://github.com/DataDog/dd-trace-py/issues"
                )

    def periodic(self):
        """Flush once, then shorten the next interval by the time the flush took."""
        began = compat.monotonic()
        try:
            self.flush()
        finally:
            # Never go below zero, even when the flush took longer than the configured interval.
            elapsed = compat.monotonic() - began
            self.interval = max(0, self._configured_interval - elapsed)
class CaptureSamplerCollector(Collector):
    """Collector that carries a capture percentage and a capture sampler."""

    # The default is produced by `attr_utils.from_env("DD_PROFILING_CAPTURE_PCT", 2.0, float)`;
    # presumably that reads the environment variable and falls back to 2.0 -- confirm
    # against `attr_utils`.
    capture_pct = attr.ib(factory=attr_utils.from_env("DD_PROFILING_CAPTURE_PCT", 2.0, float))
    # Not an `__init__` argument and left out of the repr. The value is built by calling
    # `_create_capture_sampler` with the instance itself (`takes_self=True`).
    _capture_sampler = attr.ib(default=attr.Factory(_create_capture_sampler, takes_self=True), init=False, repr=False)
class MemoryCollector(collector.PeriodicCollector):
    """Memory allocation collector."""

    _DEFAULT_MAX_EVENTS = 32
    _DEFAULT_INTERVAL = 0.5

    # Arbitrary interval to empty the _memalloc event buffer
    _interval = attr.ib(default=_DEFAULT_INTERVAL, repr=False)

    # TODO make this dynamic based on the 1. interval and 2. the max number of events allowed in the Recorder
    _max_events = attr.ib(factory=attr_utils.from_env("_DD_PROFILING_MEMORY_EVENTS_BUFFER", _DEFAULT_MAX_EVENTS, int))
    max_nframe = attr.ib(factory=attr_utils.from_env("DD_PROFILING_MAX_FRAMES", 64, int))
    heap_sample_size = attr.ib(type=int, factory=_get_default_heap_sample_size)
    ignore_profiler = attr.ib(factory=attr_utils.from_env("DD_PROFILING_IGNORE_PROFILER", False, formats.asbool))

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Start collecting memory profiles.

        :raises collector.CollectorUnavailable: when `_memalloc` is None.
        """
        if _memalloc is None:
            raise collector.CollectorUnavailable

        _memalloc.start(self.max_nframe, self._max_events, self.heap_sample_size)

        super(MemoryCollector, self)._start_service()

    def _stop_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Stop the periodic service, then stop `_memalloc` when it is available."""
        super(MemoryCollector, self)._stop_service()

        if _memalloc is not None:
            try:
                _memalloc.stop()
            except RuntimeError:
                # Deliberate best effort: a RuntimeError from stop() is ignored.
                # NOTE(review): presumably raised when _memalloc was never started -- confirm.
                pass

    def _get_thread_id_ignore_set(self):
        # type: () -> typing.Set[int]
        """Return the idents of live threads flagged with `_ddtrace_profiling_ignore`."""
        # This method is not perfect and prone to race condition in theory, but very little in practice.
        # Anyhow it's not a big deal — it's a best effort feature.
        return {
            thread.ident
            for thread in threading.enumerate()
            if getattr(thread, "_ddtrace_profiling_ignore", False) and thread.ident is not None
        }

    def snapshot(self):
        """Return a 1-tuple holding a tuple of `MemoryHeapSampleEvent` built from `_memalloc.heap()`.

        When `ignore_profiler` is set, samples from threads in the ignore set are skipped.
        """
        thread_id_ignore_set = self._get_thread_id_ignore_set()
        return (
            tuple(
                MemoryHeapSampleEvent(
                    thread_id=thread_id,
                    thread_name=_threading.get_thread_name(thread_id),
                    thread_native_id=_threading.get_thread_native_id(thread_id),
                    frames=stack,
                    nframes=nframes,
                    size=size,
                    sample_size=self.heap_sample_size,
                )
                for (stack, nframes, thread_id), size in _memalloc.heap()
                if not self.ignore_profiler or thread_id not in thread_id_ignore_set
            ),
        )

    def collect(self):
        """Return a 1-tuple holding a tuple of `MemoryAllocSampleEvent` from `_memalloc.iter_events()`.

        When `ignore_profiler` is set, events from threads in the ignore set are skipped.
        """
        events, count, alloc_count = _memalloc.iter_events()
        # Guard the ratio: a zero allocation count would otherwise raise ZeroDivisionError
        # and abort the whole collection.
        capture_pct = 100 * count / alloc_count if alloc_count else 0.0
        thread_id_ignore_set = self._get_thread_id_ignore_set()
        # TODO: The event timestamp is slightly off since it's going to be the time we copy the data from the
        # _memalloc buffer to our Recorder. This is fine for now, but we might want to store the nanoseconds
        # timestamp in C and then return it via iter_events.
        return (
            tuple(
                MemoryAllocSampleEvent(
                    thread_id=thread_id,
                    thread_name=_threading.get_thread_name(thread_id),
                    thread_native_id=_threading.get_thread_native_id(thread_id),
                    frames=stack,
                    nframes=nframes,
                    size=size,
                    capture_pct=capture_pct,
                    nevents=alloc_count,
                )
                for (stack, nframes, thread_id), size in events
                if not self.ignore_profiler or thread_id not in thread_id_ignore_set
            ),
        )
class PprofHTTPExporter(pprof.PprofExporter):
    """Export pprof profiles over HTTP as a multipart form upload."""

    endpoint = attr.ib()
    api_key = attr.ib(default=None)
    # Do not use the default agent timeout: it is too short, the agent is just an unbuffered proxy and the profiling
    # backend is not as fast as the tracer one.
    timeout = attr.ib(factory=attr_utils.from_env("DD_PROFILING_API_TIMEOUT", 10.0, float), type=float)
    service = attr.ib(default=None)
    env = attr.ib(default=None)
    version = attr.ib(default=None)
    tags = attr.ib(factory=dict)
    max_retry_delay = attr.ib(default=None)
    _container_info = attr.ib(factory=container.get_container_info, repr=False)
    _retry_upload = attr.ib(init=False, eq=False)
    endpoint_path = attr.ib(default="/profiling/v1/input")

    def __attrs_post_init__(self):
        """Build the retry policy and the final tag dictionary."""
        if self.max_retry_delay is None:
            self.max_retry_delay = self.timeout * 3

        # Randomized exponential backoff; retrying stops once `max_retry_delay` has elapsed.
        self._retry_upload = tenacity.Retrying(
            wait=tenacity.wait_random_exponential(multiplier=0.5),
            stop=tenacity.stop_after_delay(self.max_retry_delay),
            retry_error_cls=UploadFailed,
            retry=tenacity.retry_if_exception_type((http_client.HTTPException, OSError, IOError)),
        )

        # Later sources win on key clashes: DD_TAGS, then DD_PROFILING_TAGS, then the
        # tags passed to the exporter.
        global_tags = parse_tags_str(os.environ.get("DD_TAGS"))
        profiling_tags = parse_tags_str(os.environ.get("DD_PROFILING_TAGS"))
        merged = {}
        for source in (global_tags, profiling_tags, self.tags):
            for key, value in source.items():
                merged[key] = six.ensure_binary(value)

        # Built-in tags override anything supplied above.
        merged.update(
            {
                "host": HOSTNAME.encode("utf-8"),
                "language": b"python",
                "runtime": PYTHON_IMPLEMENTATION,
                "runtime_version": PYTHON_VERSION,
                "profiler_version": ddtrace.__version__.encode("ascii"),
            }
        )
        if self.version:
            merged["version"] = self.version.encode("utf-8")
        if self.env:
            merged["env"] = self.env.encode("utf-8")

        self.tags = merged

    @staticmethod
    def _encode_multipart_formdata(fields, tags):
        """Return `(content_type, body)` for a multipart upload of `fields` and `tags`.

        Every field except "chunk-data" is written first, then one "tags[]" part per
        tag, and finally "chunk-data" as the file part.
        """
        boundary = binascii.hexlify(os.urandom(16))

        # The body that is generated is very sensitive and must perfectly match what the server expects.
        field_parts = b"".join(
            b"--%s\r\n"
            b'Content-Disposition: form-data; name="%s"\r\n'
            b"\r\n"
            b"%s\r\n" % (boundary, field.encode(), value)
            for field, value in fields.items()
            if field != "chunk-data"
        )
        tag_parts = b"".join(
            b"--%s\r\n"
            b'Content-Disposition: form-data; name="tags[]"\r\n'
            b"\r\n"
            b"%s:%s\r\n" % (boundary, tag.encode(), value)
            for tag, value in tags.items()
        )
        file_part = (
            b"--"
            + boundary
            + b"\r\n"
            b'Content-Disposition: form-data; name="chunk-data"; filename="profile.pb.gz"\r\n'
            + b"Content-Type: application/octet-stream\r\n\r\n"
            + fields["chunk-data"]
            + b"\r\n--%s--\r\n" % boundary
        )
        body = field_parts + tag_parts + file_part

        content_type = b"multipart/form-data; boundary=%s" % boundary

        return content_type, body

    def _get_tags(self, service):
        """Return the upload tags: service and runtime id, updated with `self.tags`."""
        combined = {
            "service": service.encode("utf-8"),
            "runtime-id": runtime.get_runtime_id().encode("ascii"),
        }
        combined.update(self.tags)
        return combined

    def export(self, events, start_time_ns, end_time_ns):
        """Export events to an HTTP endpoint.

        :param events: The event dictionary from a `ddtrace.profiling.recorder.Recorder`.
        :param start_time_ns: The start time of recording.
        :param end_time_ns: The end time of recording.
        """
        headers = {"DD-API-KEY": self.api_key.encode()} if self.api_key else {}

        container_info = self._container_info
        if container_info and container_info.container_id:
            headers["Datadog-Container-Id"] = container_info.container_id

        profile = super(PprofHTTPExporter, self).export(events, start_time_ns, end_time_ns)

        # Gzip the serialized profile into an in-memory buffer.
        compressed = six.BytesIO()
        with gzip.GzipFile(fileobj=compressed, mode="wb") as gz:
            gz.write(profile.SerializeToString())

        def _iso_seconds(time_ns):
            # ISO-8601 timestamp truncated to whole seconds, suffixed with "Z".
            moment = datetime.datetime.utcfromtimestamp(time_ns / 1e9).replace(microsecond=0)
            return (moment.isoformat() + "Z").encode()

        # Key order matters: the multipart encoder writes the fields in this order.
        fields = {
            "runtime-id": runtime.get_runtime_id().encode("ascii"),
            "recording-start": _iso_seconds(start_time_ns),
            "recording-end": _iso_seconds(end_time_ns),
            "runtime": PYTHON_IMPLEMENTATION,
            "format": b"pprof",
            "type": b"cpu+alloc+exceptions",
            "chunk-data": compressed.getvalue(),
        }

        # Without a configured service, fall back to the basename of the profile's first mapping.
        service = self.service or os.path.basename(profile.string_table[profile.mapping[0].filename])

        content_type, body = self._encode_multipart_formdata(
            fields,
            tags=self._get_tags(service),
        )
        headers["Content-Type"] = content_type

        client = agent.get_connection(self.endpoint, self.timeout)
        self._upload(client, self.endpoint_path, body, headers)

    def _upload(self, client, path, body, headers):
        """Run `_upload_once` under the retry policy."""
        self._retry_upload(self._upload_once, client, path, body, headers)

    def _upload_once(self, client, path, body, headers):
        """POST the body once and turn the HTTP status into a result.

        Returns on 2xx, raises `tenacity.TryAgain` on 5xx and
        `exporter.ExportError` for any other status.
        """
        try:
            client.request("POST", path, body=body, headers=headers)
            response = client.getresponse()
            response.read()  # reading is mandatory
        finally:
            client.close()

        status = response.status

        if 200 <= status < 300:
            return

        if 500 <= status < 600:
            raise tenacity.TryAgain

        if status == 400:
            raise exporter.ExportError("Server returned 400, check your API key")

        if status == 404 and not self.api_key:
            raise exporter.ExportError(
                "Datadog Agent is not accepting profiles. "
                "Agent-based profiling deployments require Datadog Agent >= 7.20"
            )

        raise exporter.ExportError("HTTP Error %d" % status)