Example #1
    def test_metadata_does_not_require_file_read(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        with arrow_table_context(
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        ) as (path, table):
            result = LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            )
            cache_render_result(self.workflow, self.step, 1, result)
        # Delete from disk entirely, to prove we did not read.
        s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_step = Step.objects.get(id=self.step.id)
        cached_result = fresh_step.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
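
Every example on this page exercises the same s3.remove(bucket, key) helper, whose implementation is not shown here. As a point of reference only, a minimal sketch of such a wrapper, assuming a boto3-backed client and ambient AWS credentials (both assumptions, not the project's actual code):

import boto3

_client = boto3.client("s3")  # assumed setup; the real project wraps its own client

def remove(bucket: str, key: str) -> None:
    """Delete one object; S3's DeleteObject succeeds even if the key does not exist."""
    _client.delete_object(Bucket=bucket, Key=key)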
Example #2
def _delete_from_s3_pre_delete(sender, instance, **kwargs):
    """Delete file from S3, pre-delete.

    Why pre-delete and not post-delete? Because our user expects the file to be
    _gone_, completely, forever -- that's what "delete" means to the user. If
    deletion fails, we need the link to remain in our database -- that's how
    the user will know it isn't deleted.
    """
    if instance.key:
        s3.remove(s3.StoredObjectsBucket, instance.key)
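
For this handler to fire, it has to be connected to Django's pre_delete signal. A minimal wiring sketch, where the sender model name (StoredObject) and its import path are assumptions inferred from s3.StoredObjectsBucket:

from django.db.models.signals import pre_delete

from myapp.models import StoredObject  # assumed model name and location

pre_delete.connect(_delete_from_s3_pre_delete, sender=StoredObject)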
Example #3
    def test_fetch_result_deleted_file_means_none(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, step.id, path)
        step.stored_data_version = so.stored_at
        step.save(update_fields=["stored_data_version"])
        # Now delete the file on S3 -- but leave the DB pointing to it.
        s3.remove(s3.StoredObjectsBucket, so.key)

        module_zipfile = create_module_zipfile(
            "x",
            spec_kwargs={"loads_data": True},
            python_code=textwrap.dedent(
                """
                import pandas as pd
                def render(table, params, *, fetch_result, **kwargs):
                    assert fetch_result is None
                    return pd.DataFrame()
                """
            ),
        )

        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                execute_step(
                    chroot_context=self.chroot_context,
                    workflow=workflow,
                    step=step,
                    module_zipfile=module_zipfile,
                    params={},
                    tab_name=tab.name,
                    input_path=self.empty_table_path,
                    input_table_columns=[],
                    tab_results={},
                    output_path=self.output_path,
                )
            )
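
The test above pins down a behaviour rather than an implementation: when the database still references a stored object whose file has been removed from S3, render() must receive fetch_result=None rather than an error. A hypothetical sketch of that pattern using boto3 directly (this is not the project's loader; every name below is assumed):

import boto3
from botocore.exceptions import ClientError

def load_fetch_result(bucket: str, key: str, dest_path: str):
    client = boto3.client("s3")
    try:
        client.download_file(bucket, key, dest_path)
    except ClientError as err:
        if err.response["Error"]["Code"] in ("404", "NoSuchKey"):
            return None  # file deleted on S3 => render() sees fetch_result=None
        raise
    return dest_path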
Example #4
def _finish_upload(data: Dict[str, Any]) -> Dict[str, Any]:
    """Create an UploadedFile by moving data out of tusd's bucket.

    Return kwargs for SetStepParams.
    """
    # SECURITY: we expect metadata to come from Workbench itself. (On
    # production, there's no route from the Internet to tusd's POST endpoint.)
    # However, let's cast to correct types just to be safe. If a miscreant
    # comes along, that'll cause a 500 error and we'll be notified. (Better
    # than sending untrusted data to Django ORM.)
    # Raise TypeError, KeyError, ValueError.
    filename = str(data["MetaData"]["filename"])
    api_token = str(data["MetaData"]["apiToken"])
    workflow_id = int(data["MetaData"]["workflowId"])
    step_slug = data["MetaData"]["stepSlug"]
    size = int(data["Size"])
    bucket = str(data["Storage"]["Bucket"])
    key = str(data["Storage"]["Key"])

    if bucket != s3.TusUploadBucket:
        # security: if a hijacker manages to craft a request here, prevent its
        # creator from copying a file he/she can't see. (The creator is only
        # known to be able to see `key` of `s3.TusUploadBucket`.)
        raise RuntimeError("SECURITY: did tusd send this request?")

    suffix = PurePath(filename).suffix
    file_uuid = str(uuid.uuid4())
    final_key = None

    with upload.locked_and_loaded_step(workflow_id, step_slug) as (
        workflow,
        step,
        param_id_name,
    ):  # raise UploadError
        # Ensure upload's API token is the same as the one we sent tusd.
        #
        # This doesn't give security: an attacker can simulate a request from
        # tusd with api_token=None and it will look like a browser-initiated
        # one.
        #
        # It's for timing: if the user resets a module's API token, we should
        # disallow all prior uploads.
        if api_token:  # empty when React client uploads
            upload.raise_if_api_token_is_wrong(step, api_token)  # raise UploadError

        final_key = step.uploaded_file_prefix + str(file_uuid) + suffix

        # Tricky leak here: if there's an exception or crash, the transaction
        # is reverted. final_key will remain in S3 but the database won't point
        # to it.
        #
        # Not a huge deal, because `final_key` is in the Step's own directory.
        # The user can delete all leaked files by deleting the Step.
        s3.copy(
            s3.UserFilesBucket,
            final_key,
            f"{bucket}/{key}",
            MetadataDirective="REPLACE",
            ContentDisposition=s3.encode_content_disposition(filename),
            ContentType="application/octet-stream",
        )

        step.uploaded_files.create(
            name=filename, size=size, uuid=file_uuid, key=final_key
        )
        delete_old_files_to_enforce_storage_limits(step=step)
        s3.remove(bucket, key)

    return dict(
        workflow_id=workflow_id, step=step, new_values={param_id_name: file_uuid}
    )
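
The expected shape of data can be read off the lookups at the top of the function. A call with a hypothetical tusd hook payload (every value below is invented for illustration) would look like this:

kwargs = _finish_upload(
    {
        "Size": 1234,
        "MetaData": {
            "filename": "report.csv",
            "apiToken": "",  # empty for browser-initiated uploads, so the token check is skipped
            "workflowId": "123",
            "stepSlug": "step-1",
        },
        "Storage": {"Bucket": s3.TusUploadBucket, "Key": "tus-upload-key"},
    }
)
# kwargs now holds workflow_id, step and new_values -- the arguments for SetStepParams.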
Example #5
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when UploadedFile is deleted
    s3.remove(s3.UserFilesBucket, instance.key)
Example #6
def _clear() -> None:
    # Bucket and Key are module-level names, defined outside this snippet.
    s3.remove(Bucket, Key)