def task_with_log_datasets(): log_dataset_op( "location://path/to/value.csv", "read", # Check passing str values too success=False, with_schema=False, )
def task_with_log_datasets(): log_dataset_op( "location://path/to/value.csv", DbndDatasetOperationType.read, row_count=987, column_count=4, )
def flush_operations(self, connection: PostgresConnectionWrapper):
    if connection in self.connections:
        for op in self.connections.get_operations(connection):
            if self.conf.with_schema:
                op.extract_schema(connection)
            if self.conf.with_stats:
                op.extract_stats(connection)
            if self.conf.with_preview:
                op.extract_preview(connection)

            log_dataset_op(
                op_path=op.render_connection_path(connection),
                op_type=op.op_type,
                success=op.success,
                data=op,
                error=op.error,
                with_preview=self.conf.with_preview,
                send_metrics=self.conf.send_metrics,
                with_schema=self.conf.with_schema,
                with_partition=self.conf.with_partition,
                with_stats=self.conf.with_stats,
                with_histograms=self.conf.with_histograms,
            )
        # clear the batch of operations we just reported so they are not reported twice
        self.connections.clear_operations(connection)
def report_operations(
    self, connection: SnowflakeConnection, operations: List[SqlOperation]
):
    if connection.is_closed():
        # connection is already closed, cannot proceed (and the operations were probably already tracked)
        return

    # update the table names
    operations = [op.evolve_table_name(connection) for op in operations]

    # look up the schemas of the referenced tables
    tables = chain.from_iterable(op.tables for op in operations if not op.is_file)
    tables_schemas: Dict[str, DTypes] = {}
    for table in tables:
        table_schema = get_snowflake_table_schema(connection, table)
        if table_schema:
            tables_schemas[table] = table_schema

    operations: List[SqlOperation] = [
        op.evolve_schema(tables_schemas) for op in operations
    ]

    for op in operations:
        log_dataset_op(
            op_path=render_connection_path(connection, op, "snowflake"),
            op_type=op.op_type,
            success=op.success,
            data=op,
            with_schema=True,
            send_metrics=True,
        )
def task_with_log_datasets(): log_dataset_op( "/path/to/value.csv", DbndDatasetOperationType.read, data=pandas_data_frame, with_preview=True, with_schema=True, )
import os
from random import uniform

import numpy as np
import pandas as pd


def get_data(source: str, days: int):
    today = pd.Timestamp.now()
    start_date = today - pd.Timedelta(days=days)
    name = (
        f"test_data_{today.strftime('%Y-%m-%d')}_{start_date.strftime('%Y-%m-%d')}.csv"
    )
    file_path = os.path.join(source, name)

    # generate a random number of rows proportional to the requested number of days
    records_amounts = int((days * 24 * 10) * uniform(0.5, 1.5))
    df = pd.DataFrame(
        np.random.randint(0, 100, size=(records_amounts, 4)), columns=list("ABCD")
    )
    # random_date is a helper that spreads timestamps across the date range; see the sketch below
    df["dates"] = [random_date(start_date, today) for _ in range(len(df))]

    log_dataset_op(
        op_path=file_path,
        op_type=DbndDatasetOperationType.read,
        data=df,
        with_schema=True,
        with_preview=True,
        with_histograms=True,
    )
    return df
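The example above calls a random_date helper that isn't shown. A minimal sketch of such a helper, assuming the timestamps should be spread uniformly between the two bounds, could look like this:

import pandas as pd
from random import uniform


def random_date(start: pd.Timestamp, end: pd.Timestamp) -> pd.Timestamp:
    # pick a uniformly distributed timestamp between start and end
    delta_seconds = (end - start).total_seconds()
    return start + pd.Timedelta(seconds=uniform(0, delta_seconds))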
def task_with_log_datasets(): log_dataset_op( "location://path/to/value.csv", DbndDatasetOperationType.read, with_schema=False, )
def task_with_log_datasets():
    a_target = target("/path/to/value.csv")
    log_dataset_op(a_target, DbndDatasetOperationType.read, with_schema=False)
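The snippets above use dbnd's dataset-logging API directly. A minimal sketch of the imports they rely on (exact import paths may vary between dbnd versions) is:

from dbnd import log_dataset_op                            # dataset operation logging entry point
from dbnd._core.constants import DbndDatasetOperationType  # read / write operation types
from targets import target                                 # used in the target()-based example above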