Example #1
# Body of a Dagster solid: download an S3 object into the file_cache resource
# and emit a FileHandle pointing at the cached copy.
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    # fall back to the basename of the S3 key when no file_key is configured
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache

    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info("File handle written at : {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)],
    )
    yield Output(target_file_handle)
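This is only the body of a Dagster solid; it runs once it is declared with the legacy @solid decorator and given its resources. A minimal sketch of that declaration, assuming the config key and resource names implied by the body above (the exact schema is an assumption, not the library's published definition):

from dagster import Field, String, solid

@solid(
    # "file_key" is optional: the body falls back to the S3 key's basename
    config_schema={"file_key": Field(String, is_required=False)},
    # the body reads context.resources.file_cache and context.resources.s3
    required_resource_keys={"file_cache", "s3"},
)
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    ...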
Example #2
    def handle_output(
        self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
    ):
        path = self._get_path(context)
        if isinstance(obj, pandas.DataFrame):
            # pandas: count rows locally, then write a single parquet file
            row_count = len(obj)
            obj.to_parquet(path=path, index=False)
        elif isinstance(obj, pyspark.sql.DataFrame):
            # Spark: count() triggers a job; write a parquet directory
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield MetadataEntry.int(value=row_count, label="row_count")
        yield MetadataEntry.path(path=path, label="path")
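handle_output is one half of an IOManager; a sketch of how a class built around it could be registered as a resource, assuming the legacy IOManager/io_manager API (PandasParquetIOManager and the load_input body are hypothetical names for illustration):

from dagster import IOManager, io_manager

class PandasParquetIOManager(IOManager):
    def handle_output(self, context, obj):
        ...  # the method shown above

    def load_input(self, context):
        ...  # read the parquet written by handle_output back into a DataFrame

@io_manager
def pandas_parquet_io_manager(_init_context):
    # hypothetical factory; a real one would accept a configurable base path
    return PandasParquetIOManager()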
Example #3
    def handle_output(self, context, obj: pd.DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        obj.to_csv(fpath)
        # record the resolved version in a sidecar file next to the CSV
        with open(fpath + ".version", "w") as f:
            f.write(context.version if context.version else "None")

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.path(fpath, "Path")
        yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
        yield MetadataEntry.text(context.version if context.version else "None", "Resolved version")
        yield MetadataEntry.table_schema(
            self.get_schema(context.dagster_type),
            "Schema",
        )
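An IOManager that writes a CSV in handle_output needs a matching load_input to read it back. A minimal sketch under the same class, assuming pd is pandas and _get_fs_path is the helper used above (the use of context.upstream_output here is an assumption about how the input maps to its producing output):

    def load_input(self, context):
        # resolve the same path the upstream handle_output wrote to
        fpath = self._get_fs_path(context.upstream_output.asset_key)
        return pd.read_csv(fpath)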
Example #4
# Body of a Dagster solid: upload a file from the file manager to S3 and
# materialize the uploaded object as an asset.
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    with context.resources.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield AssetMaterialization(
            asset_key=s3_file_handle.s3_path,
            metadata_entries=[
                MetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))
            ],
        )

        yield Output(value=s3_file_handle, output_name="s3_file_handle")
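The solid reads its Bucket and Key from solid_config, so the run config has to supply them. A sketch of the matching run config, assuming the solid is registered under the name file_handle_to_s3 (the bucket and key values are placeholders):

run_config = {
    "solids": {
        "file_handle_to_s3": {
            "config": {"Bucket": "my-bucket", "Key": "path/to/target.dat"}
        }
    }
}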
Example #5
    def handle_output(self, context: OutputContext,
                      obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]):
        path = self._get_path(context)
        # only create directories for local filesystem paths; skip for
        # URI-style base paths such as s3://
        if "://" not in self._base_path:
            os.makedirs(os.path.dirname(path), exist_ok=True)

        if isinstance(obj, pandas.DataFrame):
            row_count = len(obj)
            context.log.info(f"Row count: {row_count}")
            obj.to_parquet(path=path, index=False)
        elif isinstance(obj, pyspark.sql.DataFrame):
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield MetadataEntry.int(value=row_count, label="row_count")
        yield MetadataEntry.path(path=path, label="path")
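Both parquet examples lean on self._get_path(context) to decide where an output lands. A sketch of one plausible implementation, assuming outputs are keyed by run, step, and output name (this layout is an assumption for illustration, not the library's contract); a plain "/" join keeps URI-style base paths such as s3:// intact:

    def _get_path(self, context):
        # base_path/run_id/step_key/output_name.pq
        return "/".join(
            [self._base_path, context.run_id, context.step_key, context.name + ".pq"]
        )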