def test_metadata_does_not_require_file_read(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    with arrow_table_context(
        make_column("A", [1], format="{:,.2f}"),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    ) as (path, table):
        result = LoadedRenderResult(
            path=path, table=table, columns=columns, errors=[], json={}
        )
        cache_render_result(self.workflow, self.step, 1, result)

    # Delete from disk entirely, to prove we did not read.
    s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_step = Step.objects.get(id=self.step.id)
    cached_result = fresh_step.cached_render_result

    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))

def _delete_from_s3_pre_delete(sender, instance, **kwargs):
    """Delete file from S3, pre-delete.

    Why pre-delete and not post-delete? Because our user expects the file to
    be _gone_, completely, forever -- that's what "delete" means to the user.
    If deletion fails, we need the link to remain in our database -- that's
    how the user will know it isn't deleted.
    """
    if instance.key:
        s3.remove(s3.StoredObjectsBucket, instance.key)

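# Minimal sketch (an assumption, not taken from the source) of how a handler
# like the one above is wired up. Django's pre_delete signal fires before the
# row is deleted, so if s3.remove() raises, the delete is aborted and the
# database row survives -- matching the docstring's reasoning. The model name
# `StoredObject` is assumed here; substitute the real model that owns `key`.
from django.db.models.signals import pre_delete

pre_delete.connect(_delete_from_s3_pre_delete, sender=StoredObject)
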
def test_fetch_result_deleted_file_means_none(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, step.id, path)
    step.stored_data_version = so.stored_at
    step.save(update_fields=["stored_data_version"])

    # Now delete the file on S3 -- but leave the DB pointing to it.
    s3.remove(s3.StoredObjectsBucket, so.key)

    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        python_code=textwrap.dedent(
            """
            import pandas as pd

            def render(table, params, *, fetch_result, **kwargs):
                assert fetch_result is None
                return pd.DataFrame()
            """
        ),
    )

    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                chroot_context=self.chroot_context,
                workflow=workflow,
                step=step,
                module_zipfile=module_zipfile,
                params={},
                tab_name=tab.name,
                input_path=self.empty_table_path,
                input_table_columns=[],
                tab_results={},
                output_path=self.output_path,
            )
        )

def _finish_upload(data: Dict[str, Any]) -> Dict[str, Any]:
    """Create an UploadedFile by moving data out of tusd's bucket.

    Return kwargs for SetStepParams.
    """
    # SECURITY: we expect metadata to come from Workbench itself. (On
    # production, there's no route from the Internet to tusd's POST endpoint.)
    # However, let's cast to correct types just to be safe. If a miscreant
    # comes along, that'll cause a 500 error and we'll be notified. (Better
    # than sending untrusted data to Django ORM.)
    # Raise TypeError, KeyError, ValueError.
    filename = str(data["MetaData"]["filename"])
    api_token = str(data["MetaData"]["apiToken"])
    workflow_id = int(data["MetaData"]["workflowId"])
    step_slug = data["MetaData"]["stepSlug"]
    size = int(data["Size"])
    bucket = str(data["Storage"]["Bucket"])
    key = str(data["Storage"]["Key"])

    if bucket != s3.TusUploadBucket:
        # security: if a hijacker manages to craft a request here, prevent its
        # creator from copying a file he/she can't see. (The creator is only
        # known to be able to see `key` of `s3.TusUploadBucket`.)
        raise RuntimeError("SECURITY: did tusd send this request?")

    suffix = PurePath(filename).suffix
    file_uuid = str(uuid.uuid4())
    final_key = None

    with upload.locked_and_loaded_step(workflow_id, step_slug) as (
        workflow,
        step,
        param_id_name,
    ):  # raise UploadError
        # Ensure upload's API token is the same as the one we sent tusd.
        #
        # This doesn't give security: an attacker can simulate a request from
        # tusd with api_token=None and it will look like a browser-initiated
        # one.
        #
        # It's for timing: if the user resets a module's API token, we should
        # disallow all prior uploads.
        if api_token:  # empty when React client uploads
            upload.raise_if_api_token_is_wrong(step, api_token)  # raise UploadError

        final_key = step.uploaded_file_prefix + str(file_uuid) + suffix

        # Tricky leak here: if there's an exception or crash, the transaction
        # is reverted. final_key will remain in S3 but the database won't
        # point to it.
        #
        # Not a huge deal, because `final_key` is in the Step's own directory.
        # The user can delete all leaked files by deleting the Step.
        s3.copy(
            s3.UserFilesBucket,
            final_key,
            f"{bucket}/{key}",
            MetadataDirective="REPLACE",
            ContentDisposition=s3.encode_content_disposition(filename),
            ContentType="application/octet-stream",
        )

        step.uploaded_files.create(
            name=filename, size=size, uuid=file_uuid, key=final_key
        )

        delete_old_files_to_enforce_storage_limits(step=step)

        s3.remove(bucket, key)

    return dict(
        workflow_id=workflow_id, step=step, new_values={param_id_name: file_uuid}
    )

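# Illustrative only (an assumption, not taken from the source): the shape of
# the `data` dict that _finish_upload() reads. tusd's hooks describe an upload
# with MetaData, Size and Storage fields; every value below is made up.
example_hook_data = {
    "MetaData": {
        "filename": "sales.csv",
        "apiToken": "",  # empty when the React client uploads
        "workflowId": "123",  # cast to int by _finish_upload()
        "stepSlug": "step-1",
    },
    "Size": 1024,
    "Storage": {
        "Bucket": "tus-upload-bucket",  # must equal s3.TusUploadBucket
        "Key": "some-tus-upload-id",
    },
}
# _finish_upload(example_hook_data) would copy the object into
# s3.UserFilesBucket under the Step's prefix, record an UploadedFile row,
# and delete tusd's copy.
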
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when UploadedFile is deleted
    s3.remove(s3.UserFilesBucket, instance.key)

def _clear() -> None:
    s3.remove(Bucket, Key)