def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info("File handle written at : {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)],
    )
    yield Output(target_file_handle)
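# The body above reads context.solid_config and the "s3" and "file_cache" resources, so it
# assumes a legacy @solid declaration along these lines. The exact config schema is an
# assumption for this sketch; only the optional "file_key" field is implied by the code.
@solid(
    required_resource_keys={"file_cache", "s3"},
    config_schema={"file_key": Field(String, is_required=False)},
)
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    ...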
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    path = self._get_path(context)
    if isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        obj.to_parquet(path=path, index=False)
    elif isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")
    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
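# The load_input counterpart is not shown above; a sketch that picks the reader based on the
# downstream type annotation might look like the following. It assumes a "pyspark" resource
# whose spark_session attribute exposes a SparkSession, and it uses context.upstream_output
# to locate the path that handle_output wrote to.
def load_input(self, context) -> Union[pandas.DataFrame, pyspark.sql.DataFrame]:
    path = self._get_path(context.upstream_output)
    if context.dagster_type.typing_type == pandas.DataFrame:
        return pandas.read_parquet(path)
    return context.resources.pyspark.spark_session.read.parquet(path)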
def handle_output(self, context, obj: pd.DataFrame):
    """This saves the dataframe as a CSV."""
    fpath = self._get_fs_path(context.asset_key)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    obj.to_csv(fpath)
    with open(fpath + ".version", "w") as f:
        f.write(context.version if context.version else "None")

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.path(fpath, "Path")
    yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
    yield MetadataEntry.text(context.version, "Resolved version")
    yield MetadataEntry.table_schema(
        self.get_schema(context.dagster_type),
        "Schema",
    )
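# A matching load_input would read the CSV back from the same filesystem path. Using
# context.upstream_output.asset_key to recover the producing output's asset key is an
# assumption made for this sketch.
def load_input(self, context) -> pd.DataFrame:
    fpath = self._get_fs_path(context.upstream_output.asset_key)
    # to_csv above writes the dataframe index, so restore it as the index on read.
    return pd.read_csv(fpath, index_col=0)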
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    with context.resources.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)

    s3_file_handle = S3FileHandle(bucket, key)

    yield AssetMaterialization(
        asset_key=s3_file_handle.s3_path,
        metadata_entries=[
            MetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))
        ],
    )

    yield Output(value=s3_file_handle, output_name="s3_file_handle")
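# The snippet above calls a last_key helper that is not shown here; a minimal version that
# returns the final segment of the S3 key would look like this (assumed implementation).
def last_key(key: str) -> str:
    if "/" not in key:
        return key
    return key.split("/")[-1]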
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    path = self._get_path(context)
    if "://" not in self._base_path:
        os.makedirs(os.path.dirname(path), exist_ok=True)

    if isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        context.log.info(f"Row count: {row_count}")
        obj.to_parquet(path=path, index=False)
    elif isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")
    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
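# One way to expose the handler above as a resource is an @io_manager factory. The class
# name ParquetIOManager and the "base_path" config field are assumptions for this sketch,
# since the enclosing class definition is not shown.
@io_manager(config_schema={"base_path": str})
def parquet_io_manager(init_context):
    return ParquetIOManager(base_path=init_context.resource_config["base_path"])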