# The yield inside try/finally is the standard context-manager pattern; the
# @contextlib.contextmanager decoration is presumed here. `bucket` and `key`
# are module-level test constants.
@contextlib.contextmanager
def _file_on_s3(self, relpath):
    path = Path(__file__).parent / 'test_data' / relpath
    try:
        minio.fput_file(bucket, key, path)
        yield
    finally:
        minio.remove(bucket, key)
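# A usage sketch, assuming the @contextlib.contextmanager decoration above
# and the module-level `bucket`/`key` constants; the test name and file name
# are hypothetical:
def test_reads_file_from_s3(self):
    with self._file_on_s3('example.csv'):
        pass  # the object exists at (bucket, key) inside this block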
def test_value_counts_missing_parquet_file(self):
    # https://www.pivotaltracker.com/story/show/161988744
    crr = self.wf_module2.cache_render_result(
        2,
        ProcessResult(pd.DataFrame({
            'A': ['a', 'b', 'b', 'a', 'c', np.nan],
            'B': ['x', 'x', 'x', 'x', 'x', 'x'],
        })))
    self.wf_module2.save()
    # Simulate a race: we're overwriting the cache or deleting the WfModule
    # or some-such.
    minio.remove(minio.CachedRenderResultsBucket, crr.parquet_key)
    response = self.client.get(
        f'/api/wfmodules/{self.wf_module2.id}/value-counts?column=A')
    # We _could_ return an empty result set; but our only goal here is
    # "don't crash" and this 404 seems to be the simplest implementation.
    # (We assume that if the data is deleted, the user has moved elsewhere
    # and this response is going to be ignored.)
    self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
    self.assertEqual(json.loads(response.content),
                     {'error': 'column "A" not found'})
def test_metadata_comes_from_db_columns(self):
    columns = [
        Column("A", ColumnType.NUMBER(format="{:,d}")),
        Column("B", ColumnType.DATETIME()),
        Column("C", ColumnType.TEXT()),
        Column("D", ColumnType.TEXT()),
    ]
    result = ProcessResult(
        dataframe=pandas.DataFrame(
            {
                "A": [1],  # int64
                "B": [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
                "C": ["foo"],  # str
                "D": pandas.Series(["cat"], dtype="category"),
            }
        ),
        columns=columns,
    )
    cached_result = self.wf_module.cache_render_result(self.delta.id, result)
    # cache_render_result() keeps its `result` parameter in memory, so we
    # can avoid disk entirely. Prove it by deleting from disk.
    minio.remove(minio.CachedRenderResultsBucket, cached_result.parquet_key)
    # Load a _new_ CachedRenderResult -- from DB columns, not memory.
    fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
    cached_result = fresh_wf_module.cached_render_result
    self.assertFalse(hasattr(cached_result, "_result"))
    self.assertEqual(cached_result.nrows, 1)
    self.assertEqual(cached_result.columns, columns)
def test_result_and_metadata_come_from_memory_when_available(self):
    columns = [
        Column("A", ColumnType.NUMBER(format="{:,d}")),
        Column("B", ColumnType.DATETIME()),
        Column("C", ColumnType.TEXT()),
        Column("D", ColumnType.TEXT()),
    ]
    result = ProcessResult(
        dataframe=pandas.DataFrame(
            {
                "A": [1],  # int64
                "B": [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
                "C": ["foo"],  # str
                "D": pandas.Series(["cat"], dtype="category"),
            }
        ),
        columns=columns,
    )
    cached_result = self.wf_module.cache_render_result(self.delta.id, result)
    # cache_render_result() keeps its `result` parameter in memory, so we
    # can avoid disk entirely. Prove it by deleting from disk.
    minio.remove(minio.CachedRenderResultsBucket, cached_result.parquet_key)
    self.assertIsNotNone(cached_result._result)
    self.assertEqual(cached_result.result, result)
    self.assertEqual(cached_result.nrows, 1)
    self.assertEqual(cached_result.columns, columns)
def test_empty_categorical_has_object_dtype(self):
    expected = pd.DataFrame({'A': []}, dtype=str).astype('category')
    assert expected['A'].cat.categories.dtype == object
    try:
        parquet.write(bucket, key, expected)
        result = parquet.read(bucket, key)
    finally:
        minio.remove(bucket, key)
    assert_frame_equal(result, expected)
def test_na_only_categorical_has_object_dtype(self):
    # Start with a Categorical with no non-null values. (In Workbench, all
    # Categoricals are text.)
    expected = pd.DataFrame({'A': [np.nan]}, dtype=str).astype('category')
    assert expected['A'].cat.categories.dtype == object
    try:
        parquet.write(bucket, key, expected)
        result = parquet.read(bucket, key)
    finally:
        minio.remove(bucket, key)
    assert_frame_equal(result, expected)
def _delete_from_s3_pre_delete(sender, instance, **kwargs):
    """
    Delete file from S3, pre-delete.

    Why pre-delete and not post-delete? Because our user expects the file
    to be _gone_, completely, forever -- that's what "delete" means to the
    user. If deletion fails, we need the link to remain in our database --
    that's how the user will know it isn't deleted.
    """
    if instance.bucket and instance.key:
        minio.remove(instance.bucket, instance.key)
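# A minimal wiring sketch for the receiver above, using Django's pre_delete
# signal; the UploadedFile sender is an assumption, not from the source:
from django.db.models.signals import pre_delete

pre_delete.connect(_delete_from_s3_pre_delete, sender=UploadedFile)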
def test_wf_module_render_missing_parquet_file(self):
    # https://www.pivotaltracker.com/story/show/161988744
    crr = self.wf_module2.cache_render_result(2, ProcessResult(test_data))
    self.wf_module2.save()
    # Simulate a race: we're overwriting the cache or deleting the WfModule
    # or some-such.
    minio.remove(minio.CachedRenderResultsBucket, crr.parquet_key)
    response = self.client.get("/api/wfmodules/%d/render" % self.wf_module2.id)
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(
        json.loads(response.content),
        {"end_row": 0, "rows": [], "start_row": 0},
    )
def delete_upload_stored_objects(apps, _):
    from server import minio

    StoredObject = apps.get_model('server', 'StoredObject')
    for obj in StoredObject.objects.filter(
        wf_module__module_id_name='upload'
    ).all():
        try:
            minio.remove(obj.bucket, obj.key)
        except FileNotFoundError:
            # We're resuming, or the file never existed anyway. (We never
            # designed for errors, and we changed error behavior over time;
            # it's possible some uploads never had data.)
            pass
        obj.delete()
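# A minimal sketch of registering the cleanup above in a Django data
# migration; the dependency entry is a placeholder, not from the source:
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        # ('server', '<previous migration>'),
    ]

    operations = [
        migrations.RunPython(delete_upload_stored_objects,
                             migrations.RunPython.noop),
    ]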
def delete_s3_data(self):
    """
    Delete all data from S3 that is part of this upload.

    Call this within a Workflow.cooperative_lock(). This always leaves S3
    and the database in a consistent state.
    """
    key = self.get_upload_key()
    minio.abort_multipart_uploads_by_prefix(self.Bucket, key)
    minio.remove(self.Bucket, key)
    if not self.wf_module.uploaded_files.filter(uuid=str(self.id)).count():
        # If there's no UploadedFile even though we copied this file to
        # where the UploadedFile _should_ point, then we've leaked that
        # copy. Delete it. See "tricky leak here" in
        # convert_to_uploaded_file().
        final_key_prefix = (
            self.wf_module.uploaded_file_prefix + str(self.id)
        )  # no ".xlsx"-type suffix
        minio.remove_by_prefix(minio.UserFilesBucket, final_key_prefix)
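# A usage sketch following the docstring's locking rule. `upload` is a
# hypothetical instance of the model that owns delete_s3_data(), and the
# wf_module -> workflow traversal is an assumption:
with upload.wf_module.workflow.cooperative_lock():
    upload.delete_s3_data()
    upload.delete()  # drop the database row once S3 is consistent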
def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from server import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if '/' in old_key:
        return

    new_key = f'wf-{workflow.id}/wfm-{wf_module.id}/{old_key}'
    # Pass lazy %s args, not an f-string, so logging does the interpolation.
    logger.info('Move %s/%s to %s/%s', bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f'{bucket}/{old_key}')
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move
        #    already succeeded but the DB doesn't know it. In that case,
        #    continue, because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a
        #    blank file in its stead. That way the user will remark, "hey,
        #    Workbench ate my file!" instead of seeing undefined behavior
        #    (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # Write an empty file.
            minio.put_bytes(bucket, new_key, b'')
            uploaded_file.size = 0
            uploaded_file.save(update_fields=['size'])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=['key'])
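# A hypothetical driver for the move above, as it might appear in the same
# data migration -- the model name and relations are assumptions:
def move_all_uploaded_files(apps, _):
    UploadedFile = apps.get_model('server', 'UploadedFile')
    for uploaded_file in UploadedFile.objects.select_related(
        'wf_module__workflow'
    ):
        move_uploaded_file(uploaded_file.wf_module.workflow,
                           uploaded_file.wf_module, uploaded_file)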
def abort_inprogress_upload(self):
    """
    Delete data from S3 marked as in-progress uploads by `wf_module`.

    * Delete incomplete multipart upload
    * Delete completed upload, multipart or otherwise
    * Set `.inprogress_file_upload_*` to `None` (and save those fields)
    * Never raise `NoSuchUpload` or `FileNotFoundError`
    """
    if (
        not self.inprogress_file_upload_id
        and not self.inprogress_file_upload_key
    ):
        return

    if self.inprogress_file_upload_id:
        # If we're uploading a multipart file, delete all parts.
        try:
            minio.abort_multipart_upload(
                minio.UserFilesBucket,
                self.inprogress_file_upload_key,
                self.inprogress_file_upload_id,
            )
        except minio.error.NoSuchUpload:
            pass
    if self.inprogress_file_upload_key:
        # If we _nearly_ completed a multipart upload, or if we wrote data
        # via regular upload but didn't mark it completed, delete the file.
        try:
            minio.remove(minio.UserFilesBucket,
                         self.inprogress_file_upload_key)
        except FileNotFoundError:
            pass
    self.inprogress_file_upload_id = None
    self.inprogress_file_upload_key = None
    self.inprogress_file_upload_last_accessed_at = None
    self.save(update_fields=[
        'inprogress_file_upload_id',
        'inprogress_file_upload_key',
        'inprogress_file_upload_last_accessed_at',
    ])
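# A hypothetical reaper built on abort_inprogress_upload(); the one-day
# cutoff and the WfModule queryset are assumptions, not from the source:
from datetime import timedelta

from django.utils import timezone

stale = WfModule.objects.exclude(
    inprogress_file_upload_last_accessed_at=None,
).filter(
    inprogress_file_upload_last_accessed_at__lt=(
        timezone.now() - timedelta(days=1)
    ),
)
for wf_module in stale:
    wf_module.abort_inprogress_upload()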
def _clear() -> None:
    try:
        minio.remove(Bucket, Key)
    except minio.error.NoSuchKey:
        pass
def _delete_uploaded_file(uploaded_file):
    minio.remove(uploaded_file.bucket, uploaded_file.key)
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when an UploadedFile is deleted.
    minio.remove(instance.bucket, instance.key)
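# A minimal wiring sketch for the receiver above, using Django's post_delete
# signal; the UploadedFile sender comes from the comment above, the rest is
# standard Django:
from django.db.models.signals import post_delete

post_delete.connect(auto_delete_file_on_delete, sender=UploadedFile)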