Example #1
    @contextlib.contextmanager  # assumed: the excerpt yields once, fixture-style
    def _file_on_s3(self, relpath):
        # Upload a test-data file to (bucket, key) for the `with` block's
        # duration, then remove it -- even if the body raises.
        path = Path(__file__).parent / 'test_data' / relpath
        try:
            minio.fput_file(bucket, key, path)
            yield
        finally:
            minio.remove(bucket, key)
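A minimal usage sketch, assuming the `contextlib.contextmanager` decorator above and that `bucket` and `key` are module-level test constants; the test name, file name, and assertion are hypothetical:

    def test_reads_file_from_s3(self):
        # Hypothetical test -- inside the block the file exists at
        # (bucket, key); the fixture's `finally` clause removes it on exit.
        with self._file_on_s3('input.csv'):
            self.assertTrue(minio.exists(bucket, key))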
Example #2
    def test_value_counts_missing_parquet_file(self):
        # https://www.pivotaltracker.com/story/show/161988744
        crr = self.wf_module2.cache_render_result(
            2,
            ProcessResult(
                pd.DataFrame({
                    'A': ['a', 'b', 'b', 'a', 'c', np.nan],
                    'B': ['x', 'x', 'x', 'x', 'x', 'x'],
                })))
        self.wf_module2.save()

        # Simulate a race: we're overwriting the cache or deleting the WfModule
        # or some-such.
        minio.remove(minio.CachedRenderResultsBucket, crr.parquet_key)

        response = self.client.get(
            f'/api/wfmodules/{self.wf_module2.id}/value-counts?column=A')

        # We _could_ return an empty result set, but our only goal here is
        # "don't crash" and this 404 seems to be the simplest implementation.
        # (We assume that if the data is deleted, the user has moved elsewhere
        # and this response is going to be ignored.)
        self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
        self.assertEqual(json.loads(response.content),
                         {'error': 'column "A" not found'})
Example #3
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.NUMBER(format="{:,d}")),
            Column("B", ColumnType.DATETIME()),
            Column("C", ColumnType.TEXT()),
            Column("D", ColumnType.TEXT()),
        ]
        result = ProcessResult(
            dataframe=pandas.DataFrame(
                {
                    "A": [1],  # int64
                    "B": [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
                    "C": ["foo"],  # str
                    "D": pandas.Series(["cat"], dtype="category"),
                }
            ),
            columns=columns,
        )
        cached_result = self.wf_module.cache_render_result(self.delta.id, result)

        # cache_render_result() keeps its `result` parameter in memory, so we
        # can avoid disk entirely. Prove it by deleting from disk.
        minio.remove(minio.CachedRenderResultsBucket, cached_result.parquet_key)

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result
        self.assertFalse(hasattr(cached_result, "_result"))

        self.assertEqual(cached_result.nrows, 1)
        self.assertEqual(cached_result.columns, columns)
Example #4
    def test_result_and_metadata_come_from_memory_when_available(self):
        columns = [
            Column("A", ColumnType.NUMBER(format="{:,d}")),
            Column("B", ColumnType.DATETIME()),
            Column("C", ColumnType.TEXT()),
            Column("D", ColumnType.TEXT()),
        ]
        result = ProcessResult(
            dataframe=pandas.DataFrame(
                {
                    "A": [1],  # int64
                    "B": [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
                    "C": ["foo"],  # str
                    "D": pandas.Series(["cat"], dtype="category"),
                }
            ),
            columns=columns,
        )
        cached_result = self.wf_module.cache_render_result(self.delta.id, result)

        # cache_render_result() keeps its `result` parameter in memory, so we
        # can avoid disk entirely. Prove it by deleting from disk.
        minio.remove(minio.CachedRenderResultsBucket, cached_result.parquet_key)
        self.assertIsNotNone(cached_result._result)

        self.assertEqual(cached_result.result, result)
        self.assertEqual(cached_result.nrows, 1)
        self.assertEqual(cached_result.columns, columns)
Example #5
    def test_empty_categorical_has_object_dtype(self):
        expected = pd.DataFrame({'A': []}, dtype=str).astype('category')
        assert expected['A'].cat.categories.dtype == object
        try:
            parquet.write(bucket, key, expected)
            result = parquet.read(bucket, key)
        finally:
            minio.remove(bucket, key)
        assert_frame_equal(result, expected)
Example #6
    def test_na_only_categorical_has_object_dtype(self):
        # Start with a Categorical whose only value is NA, so it has no
        # categories. (In Workbench, all Categoricals are text.)
        expected = pd.DataFrame({'A': [np.nan]}, dtype=str).astype('category')
        assert expected['A'].cat.categories.dtype == object
        try:
            parquet.write(bucket, key, expected)
            result = parquet.read(bucket, key)
        finally:
            minio.remove(bucket, key)
        assert_frame_equal(result, expected)
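The `dtype=str` cast in these tests is what keeps the categories text-typed. Without it, pandas infers float64 for an all-NA column, so the (empty) categories index is float64 rather than object -- a quick sketch of the pitfall, not taken from the source:

import numpy as np
import pandas as pd

# Without the str cast, an all-NaN column is float64, so its (empty)
# categories index is float64 too -- not the object dtype Workbench expects.
naive = pd.DataFrame({'A': [np.nan]}).astype('category')
assert naive['A'].cat.categories.dtype != object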
Example #7
def _delete_from_s3_pre_delete(sender, instance, **kwargs):
    """
    Delete file from S3, pre-delete.

    Why pre-delete and not post-delete? Because our user expects the file to be
    _gone_, completely, forever -- that's what "delete" means to the user. If
    deletion fails, we need the link to remain in our database -- that's how
    the user will know it isn't deleted.
    """
    if instance.bucket and instance.key:
        minio.remove(instance.bucket, instance.key)
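For context, a sketch of how such a handler is typically registered with Django's `pre_delete` signal; the registration and the `UploadedFile` sender are assumptions, since only the handler appears in the excerpt:

from django.db.models.signals import pre_delete

# Hypothetical registration -- `UploadedFile` is assumed to be the model
# whose rows own the S3 objects.
pre_delete.connect(_delete_from_s3_pre_delete, sender=UploadedFile)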
Example #8
    def test_wf_module_render_missing_parquet_file(self):
        # https://www.pivotaltracker.com/story/show/161988744
        crr = self.wf_module2.cache_render_result(2, ProcessResult(test_data))
        self.wf_module2.save()

        # Simulate a race: we're overwriting the cache or deleting the WfModule
        # or some-such.
        minio.remove(minio.CachedRenderResultsBucket, crr.parquet_key)

        response = self.client.get("/api/wfmodules/%d/render" % self.wf_module2.id)
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(
            json.loads(response.content), {"end_row": 0, "rows": [], "start_row": 0}
        )
Example #9
def delete_upload_stored_objects(apps, _):
    from server import minio
    StoredObject = apps.get_model('server', 'StoredObject')

    for obj in (StoredObject.objects.filter(
            wf_module__module_id_name='upload').all()):
        try:
            minio.remove(obj.bucket, obj.key)
        except FileNotFoundError:
            # We're resuming, or the file never existed anyway. (We never
            # designed for errors, and we changed error behavior over time;
            # it's possible some uploads never had data.)
            pass
        obj.delete()
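A sketch of how this function would plug into standard Django migration machinery; the dependency tuple is hypothetical. `RunPython` calls the function with `(apps, schema_editor)`, which matches the `(apps, _)` signature above:

from django.db import migrations

class Migration(migrations.Migration):
    dependencies = [
        ('server', '0001_initial'),  # hypothetical predecessor
    ]

    operations = [
        # No sensible reverse: deleted S3 data cannot be restored.
        migrations.RunPython(delete_upload_stored_objects,
                             migrations.RunPython.noop),
    ]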
Example #10
    def delete_s3_data(self):
        """
        Delete all data from S3 that is part of this upload.

        Call this within a Workflow.cooperative_lock().

        This always leaves S3 and the database in a consistent state.
        """
        key = self.get_upload_key()
        minio.abort_multipart_uploads_by_prefix(self.Bucket, key)
        minio.remove(self.Bucket, key)

        if not self.wf_module.uploaded_files.filter(uuid=str(self.id)).count():
            # If there's no UploadedFile even though we copied this file to where the
            # UploadedFile _should_ point, then we've leaked that copy. Delete. See
            # "tricky leak here" in convert_to_uploaded_file().
            # Note: final_key_prefix has no ".xlsx"-type suffix.
            final_key_prefix = (self.wf_module.uploaded_file_prefix
                                + str(self.id))
            minio.remove_by_prefix(minio.UserFilesBucket, final_key_prefix)
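A minimal sketch of the calling pattern the docstring asks for; `upload` and the surrounding names are assumptions:

# Hypothetical caller -- hold the lock so S3 and the database stay in sync.
with upload.wf_module.workflow.cooperative_lock():
    upload.delete_s3_data()
    upload.delete()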
Example #11
def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from server import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if '/' in old_key:
        return  # already in wf-X/wfm-Y/ form; nothing to move

    new_key = f'wf-{workflow.id}/wfm-{wf_module.id}/{old_key}'

    logger.info('Move %s/%s to %s/%s', bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f'{bucket}/{old_key}')
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move already
        #    succeeded but the DB doesn't know it. In that case, continue
        #    because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a blank
        #    file in its stead. That way the user will remark, "hey, Workbench
        #    ate my file!" instead of undefined behavior (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # write an empty file
            minio.put_bytes(bucket, new_key, b'')
            uploaded_file.size = 0
            uploaded_file.save(update_fields=['size'])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=['key'])
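This helper reads like part of a one-off data migration; a sketch of a driver loop, with the queryset shape assumed:

# Hypothetical driver -- move every legacy flat-keyed file into place.
for uploaded_file in UploadedFile.objects.select_related(
        'wf_module__workflow').all():
    wf_module = uploaded_file.wf_module
    move_uploaded_file(wf_module.workflow, wf_module, uploaded_file)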
Example #12
    def abort_inprogress_upload(self):
        """
        Delete all S3 data that this `wf_module` marked as in-progress uploads.

        * Delete incomplete multi-part upload
        * Delete completed upload, multipart or otherwise
        * Set `.inprogress_file_upload_*` to `None` (and save those fields)
        * Never raise `NoSuchUpload` or `FileNotFoundError`.
        """
        if (not self.inprogress_file_upload_id
                and not self.inprogress_file_upload_key):
            return

        if self.inprogress_file_upload_id:
            # If we're uploading a multipart file, delete all parts
            try:
                minio.abort_multipart_upload(minio.UserFilesBucket,
                                             self.inprogress_file_upload_key,
                                             self.inprogress_file_upload_id)
            except minio.error.NoSuchUpload:
                pass
        if self.inprogress_file_upload_key:
            # If we _nearly_ completed a multipart upload, or if we wrote data via
            # regular upload but didn't mark it completed, delete the file
            try:
                minio.remove(minio.UserFilesBucket,
                             self.inprogress_file_upload_key)
            except FileNotFoundError:
                pass
        self.inprogress_file_upload_id = None
        self.inprogress_file_upload_key = None
        self.inprogress_file_upload_last_accessed_at = None
        self.save(update_fields=[
            'inprogress_file_upload_id', 'inprogress_file_upload_key',
            'inprogress_file_upload_last_accessed_at'
        ])
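A usage sketch, assuming this runs when a user cancels or restarts an upload; the lock mirrors Example #10 and is an assumption. Because the method swallows `NoSuchUpload` and `FileNotFoundError`, the caller needs no error handling:

# Hypothetical caller -- safe even if no data was ever uploaded.
with wf_module.workflow.cooperative_lock():
    wf_module.abort_inprogress_upload()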
Example #13
def _clear() -> None:
    try:
        minio.remove(Bucket, Key)
    except minio.error.NoSuchKey:
        pass
Example #14
def _delete_uploaded_file(uploaded_file):
    minio.remove(uploaded_file.bucket, uploaded_file.key)
Example #15
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when UploadedFile is deleted
    minio.remove(instance.bucket, instance.key)