def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
) -> None:
    """Ingest metadata into DataHub."""
    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    # Parse the recipe file into a pipeline configuration.
    recipe_path = pathlib.Path(config)
    pipeline_config = load_config_file(recipe_path)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(
            pipeline_config, dry_run, preview, preview_workunits
        )
    except ValidationError as err:
        # A malformed recipe is a user error: report it and exit non-zero.
        click.echo(err, err=True)
        sys.exit(1)
    except Exception as err:
        # pipeline_config may contain credentials or other sensitive values, so
        # wrap unexpected errors to keep variable-level detail out of the logs.
        raise SensitiveError() from err

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")

    # The summary's return value doubles as the process exit code.
    exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    pipeline.log_ingestion_stats()
    sys.exit(exit_code)
class SourceReport(Report):
    """Accumulates workunit counts, warnings, and failures produced by a source,
    along with environment details useful when debugging an ingestion run."""

    workunits_produced: int = 0
    workunit_ids: List[str] = field(default_factory=list)
    # Both maps group reason strings under a caller-chosen key (e.g. a dataset name).
    warnings: Dict[str, List[str]] = field(default_factory=dict)
    failures: Dict[str, List[str]] = field(default_factory=dict)

    # Environment snapshot captured at report creation time.
    cli_version: str = datahub.nice_version_name()
    cli_entry_location: str = datahub.__file__
    py_version: str = sys.version
    py_exec_path: str = sys.executable
    os_details: str = platform.platform()

    def report_workunit(self, wu: WorkUnit) -> None:
        """Record that a workunit was emitted, remembering its id."""
        self.workunits_produced += 1
        self.workunit_ids.append(wu.id)

    def report_warning(self, key: str, reason: str) -> None:
        """Append a warning reason under the given key."""
        # setdefault replaces the manual "if key not in dict" initialization.
        self.warnings.setdefault(key, []).append(reason)

    def report_failure(self, key: str, reason: str) -> None:
        """Append a failure reason under the given key."""
        self.failures.setdefault(key, []).append(reason)
def __init__(self):
    """Load (or create on first run) the persistent client id, then push
    user-level properties to Mixpanel on a best-effort basis."""
    # Reuse a stored client id when a config file exists; otherwise mint a
    # fresh UUID and persist it.
    if CONFIG_FILE.exists():
        self.load_config()
    else:
        self.client_id = str(uuid.uuid4())
        self.update_config()

    # Telemetry is best-effort: any connectivity failure is logged at debug
    # level and leaves self.mp as None.
    self.mp = None
    if self.enabled:
        try:
            consumer = Consumer(request_timeout=int(TIMEOUT))
            self.mp = Mixpanel(MIXPANEL_TOKEN, consumer=consumer)
            self.mp.people_set(
                self.client_id,
                {
                    "datahub_version": datahub_package.nice_version_name(),
                    "os": platform.system(),
                    "python_version": platform.python_version(),
                },
            )
        except Exception as e:
            logger.debug(f"Error connecting to mixpanel: {e}")
def ping(
    self,
    category: str,
    action: str,
    label: Optional[str] = None,
    value: Optional[int] = None,
) -> None:
    """
    Ping Google Analytics with a single event.

    Args:
        category (str): category for the event
        action (str): action taken
        label (Optional[str], optional): label for the event
        value (Optional[int], optional): value for the event
    """
    if not self.enabled:
        return

    req_url = "https://www.google-analytics.com/collect"
    params: Dict[str, Union[str, int]] = {
        "an": "datahub-cli",  # app name
        "av": datahub_package.nice_version_name(),  # app version
        "t": "event",  # event type
        "v": GA_VERSION,  # Google Analytics version
        "tid": GA_TID,  # tracking id
        "cid": self.client_id,  # client id
        "ec": category,  # event category
        "ea": action,  # event action
        # use custom dimensions to capture OS and Python version
        # see https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters#cd_
        "cd1": platform.system(),  # OS
        "cd2": platform.python_version(),  # Python version
    }

    if label:
        params["el"] = label

    # This has to be a non-negative int, otherwise the request will fail.
    # BUG FIX: compare against None explicitly — the original truthiness test
    # (`if value:`) silently dropped a legitimate value of 0.
    if value is not None:
        params["ev"] = value

    # Telemetry is best-effort: swallow and debug-log any network failure.
    try:
        requests.post(
            req_url,
            data=params,
            headers={
                "user-agent": f"datahub {datahub_package.nice_version_name()}"
            },
        )
    except Exception as e:
        logger.debug(f"Error reporting telemetry: {e}")
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
    suppress_error_logs: bool,
) -> None:
    """Ingest metadata into DataHub."""
    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    # Parse the recipe file into a pipeline configuration.
    config_file = pathlib.Path(config)
    pipeline_config = load_config_file(config_file)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(
            pipeline_config, dry_run, preview, preview_workunits
        )
    except ValidationError as e:
        # A malformed recipe is a user error: report it and exit non-zero.
        click.echo(e, err=True)
        sys.exit(1)
    except Exception as e:
        # The pipeline_config may contain sensitive information, so we wrap the exception
        # in a SensitiveError to prevent detailed variable-level information from being logged.
        raise SensitiveError() from e

    logger.info("Starting metadata ingestion")
    try:
        pipeline.run()
    except Exception as e:
        # Surface the source/sink reports even on failure so the user can see
        # how far ingestion got.
        logger.info(
            f"Source ({pipeline.config.source.type}) report:\n{pipeline.source.get_report().as_string()}"
        )
        logger.info(
            f"Sink ({pipeline.config.sink.type}) report:\n{pipeline.sink.get_report().as_string()}"
        )
        # We dont want to log sensitive information in variables if the pipeline fails due to
        # an unexpected error. Disable printing sensitive info to logs if ingestion is running
        # with `--suppress-error-logs` flag.
        if suppress_error_logs:
            raise SensitiveError() from e
        else:
            # BUG FIX (idiom): bare `raise` re-raises the active exception with
            # its original traceback; `raise e` added a redundant frame.
            raise
    else:
        logger.info("Finished metadata pipeline")
        pipeline.log_ingestion_stats()
        ret = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
        sys.exit(ret)
def init_tracking(self) -> None:
    """Send the one-time "init" telemetry event, at most once per instance."""
    # Skip when telemetry is disabled, the Mixpanel client failed to
    # initialize, or the init event has already been sent.
    if not self.enabled or self.mp is None or self.tracking_init is True:
        return

    logger.debug("Sending init Telemetry")
    # Best-effort: a failed send is debug-logged, never raised.
    try:
        self.mp.people_set(
            self.client_id,
            {
                "datahub_version": datahub_package.nice_version_name(),
                "os": platform.system(),
                "python_version": platform.python_version(),
            },
        )
    except Exception as e:
        logger.debug(f"Error reporting telemetry: {e}")

    # BUG FIX: the original assigned `self.init_track`, which nothing reads;
    # the guard above checks `self.tracking_init`, so the init event was
    # re-sent on every call. Latch the flag the guard actually inspects.
    self.tracking_init = True
def run(config: str) -> None:
    """Ingest metadata into DataHub."""
    logger.debug("DataHub CLI version: %s", datahub_package.nice_version_name())

    # Parse the recipe file into a pipeline configuration.
    recipe_path = pathlib.Path(config)
    pipeline_config = load_config_file(recipe_path)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    except ValidationError as err:
        # A malformed recipe is a user error: report it and exit non-zero.
        click.echo(err, err=True)
        sys.exit(1)

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")

    # The summary's return value doubles as the process exit code.
    sys.exit(pipeline.pretty_print_summary())
def run(config: str, dry_run: bool, preview: bool, strict_warnings: bool) -> None:
    """Ingest metadata into DataHub."""
    logger.debug("DataHub CLI version: %s", datahub_package.nice_version_name())

    # Parse the recipe file into a pipeline configuration.
    recipe_path = pathlib.Path(config)
    pipeline_config = load_config_file(recipe_path)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config, dry_run, preview)
    except ValidationError as err:
        # A malformed recipe is a user error: report it and exit non-zero.
        click.echo(err, err=True)
        sys.exit(1)

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")

    # The summary's return value doubles as the process exit code.
    exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    pipeline.log_ingestion_stats()
    sys.exit(exit_code)
# Configure logger. BASE_LOGGING_FORMAT = ( "[%(asctime)s] %(levelname)-8s {%(name)s:%(lineno)d} - %(message)s") logging.basicConfig(format=BASE_LOGGING_FORMAT) MAX_CONTENT_WIDTH = 120 @click.group(context_settings=dict( # Avoid truncation of help text. # See https://github.com/pallets/click/issues/486. max_content_width=MAX_CONTENT_WIDTH, )) @click.option("--debug/--no-debug", default=False) @click.version_option( version=datahub_package.nice_version_name(), prog_name=datahub_package.__package_name__, ) def datahub(debug: bool) -> None: if debug or os.getenv("DATAHUB_DEBUG", False): logging.getLogger().setLevel(logging.INFO) logging.getLogger("datahub").setLevel(logging.DEBUG) else: logging.getLogger().setLevel(logging.WARNING) logging.getLogger("datahub").setLevel(logging.INFO) # loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] # print(loggers) @datahub.command() def version() -> None: