Example #1
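# Migrate one container file record to the target storage: files that already have
# a UUID _id are copied to their UUID path if missing from the target; otherwise a
# new UUID is generated, the file is copied from its hash-based (CAS) path on the
# local filesystem, and the owning document is updated. If the document no longer
# matches (the file changed mid-migration), the copied blob is removed again.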
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(
                '    file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug('    generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')
        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }

        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {
                '_id': f['container_id'],
                f['prefix'] + '.name': f['fileinfo']['name'],
                f['prefix'] + '.hash': f['fileinfo']['hash']
            }, {'$set': update_set})

        if not updated_doc:
            log.info(
                'Probably the following file has been updated during the migration '
                'and its hash is changed, cleaning up from the new filesystem')
            target_fs.remove_file(file_id, f_new_path)
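
# Note: buffer_copy is referenced by the migration examples but not defined in them.
# A minimal sketch, assuming it simply streams data between two open file objects in
# fixed-size chunks (the name, signature, and behavior here are assumptions):
def buffer_copy(src_fileobj, dst_fileobj, chunk_size):
    while True:
        chunk = src_fileobj.read(chunk_size)
        if not chunk:
            break
        dst_fileobj.write(chunk)


# Fixture: upload one session file and yield (session_id, file name, download url,
# uuid path, provider id, file id); the blob is removed from its provider's storage
# during teardown.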
def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form):
    # Create a session
    session_id = data_builder.create_session()

    files = []

    # Create a UUID file
    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_1})['files'][0]
    file_id_1 = file_info['_id']
    url_1 = '/sessions/' + session_id + '/files/' + file_name_1

    files.append(
        (session_id, file_name_1, url_1, util.path_from_uuid(file_id_1),
         str(file_info['provider_id']), file_id_1))

    yield files

    # Clean up, get the files
    files = api_db['sessions'].find_one({'_id': ObjectId(session_id)})['files']
    # Delete the files; the session still exists in the DB, now with missing file data
    for f in files:
        try:
            source_fs = get_provider(f['provider_id']).storage_plugin
            source_fs.remove_file(f['_id'], None)
        except:
            pass
Example #3
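# Fixture: upload a project file, then turn it into a legacy CAS file by unsetting
# its _id and moving the blob from its UUID path to the hash-based path on the local
# filesystem; yields (project, file_name, file_content) and cleans up on teardown.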
def legacy_cas_file(as_admin, api_db, data_builder, randstr, file_form):
    """Yield legacy CAS file"""
    project = data_builder.create_project()
    file_name = '%s.csv' % randstr()
    file_content = randstr()
    as_admin.post('/projects/' + project + '/files',
                  files=file_form((file_name, file_content)))

    file_info = api_db['projects'].find_one({'files.name':
                                             file_name})['files'][0]
    file_id = file_info['_id']
    file_hash = file_info['hash']
    # simulate a legacy CAS file by unsetting the _id (verifies CAS backward compatibility)
    api_db['projects'].find_one_and_update({'files.name': file_name},
                                           {'$unset': {
                                               'files.$._id': ''
                                           }})

    file_path = unicode(util.path_from_hash(file_hash))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.exists(target_dir):
        config.local_fs.makedirs(target_dir)
    fs.move.move_file(src_fs=config.fs,
                      src_path=util.path_from_uuid(file_id),
                      dst_fs=config.local_fs,
                      dst_path=file_path)

    yield (project, file_name, file_content)

    # clean up
    config.local_fs.remove(file_path)
    config.local_fs.removetree(target_dir)
    api_db['projects'].delete_one({'_id': project})
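
# Helper: copy a file from its UUID path on src_storage to dst_path on dst_storage,
# then remove it from the source storage.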
def move_file(src_storage, src_id, dst_storage, dst_path):
    src_path = util.path_from_uuid(src_id)
    target_dir = fs.path.dirname(dst_path)
    with src_storage.storage_plugin.open(
            src_id, src_path, 'rb') as src_fp, dst_storage.storage_plugin.open(
                None, dst_path, 'wb') as dst_fp:
        shutil.copyfileobj(src_fp, dst_fp)
    src_storage.storage_plugin.remove_file(src_id, src_path)
Example #5
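# Variant of the migrate_file example built on pyfilesystem: it uses target_fs.isfile
# and fs.move.copy_file instead of manual buffered copies, with the same UUID
# migration and cleanup logic.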
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug('    file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.move.copy_file(src_fs=src_fs, src_path=file_path, dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug('    generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')
        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.move.copy_file(src_fs=local_fs, src_path=f_old_path, dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }

        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {'_id': f['container_id'],
             f['prefix'] + '.name': f['fileinfo']['name'],
             f['prefix'] + '.hash': f['fileinfo']['hash']},
            {'$set': update_set}
        )

        if not updated_doc:
            log.info('Probably the following file has been updated during the migration '
                     'and its hash is changed, cleaning up from the new filesystem')
            target_fs.remove(f_new_path)
Example #6
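# Variant of move_file that reads from config.primary_storage and writes through the
# destination storage's pyfilesystem object, creating the target directory if needed.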
def move_file(src_id, dst_storage, dst_path):
    dst_fs = dst_storage.get_fs()
    src_path = util.path_from_uuid(src_id)
    target_dir = fs.path.dirname(dst_path)
    if not dst_fs.exists(target_dir):
        dst_fs.makedirs(target_dir)
    with config.primary_storage.open(src_id, src_path,
                                     'rb') as src_fp, dst_fs.open(
                                         dst_path, 'wb') as dst_fp:
        shutil.copyfileobj(src_fp, dst_fp)
    config.primary_storage.remove_file(src_id, src_path)
Example #7
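# Migrate a gear's rootfs file: if exchange.rootfs-id is already set, copy the blob
# to the target storage; otherwise generate a UUID, copy the file from its
# hash-based path, and update the gear document with the new id.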
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(
                '    file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')

        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }

        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update({'_id': f['gear_id']},
                                        {'$set': update_set})
Example #8
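# Test: patch find_one_and_update so that a file is re-uploaded while the migration
# is running, run the migration, and verify that both files end up in primary
# storage and that the "updated during the migration" cleanup message is logged.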
def test_file_replaced_handling(files_to_migrate, migrate_storage, as_admin,
                                file_form, api_db, mocker, caplog):

    origin_find_one_and_update = pymongo.collection.Collection.find_one_and_update

    def mocked(*args, **kwargs):
        self = args[0]
        filter = args[1]
        update = args[2]

        as_admin.post('/sessions/' + session_id + '/files',
                      files=file_form((file_name_1, 'new_content')))

        return origin_find_one_and_update(self, filter, update)

    with mocker.mock_module.patch.object(pymongo.collection.Collection,
                                         'find_one_and_update', mocked):
        # get the file stored by hash in legacy storage
        (session_id, file_name_1, url_1, file_path_1) = files_to_migrate[0]
        # get the file stored by uuid in legacy storage
        (_, file_name_2, url_2, file_path_2) = files_to_migrate[1]

        # run the migration
        migrate_storage.main('--containers')

        file_1_id = api_db['sessions'].find_one({'files.name': file_name_1
                                                 })['files'][0]['_id']

        file_2_id = api_db['sessions'].find_one({'files.name': file_name_2
                                                 })['files'][1]['_id']

        assert config.primary_storage.get_file_info(
            file_1_id, util.path_from_uuid(file_1_id)) is not None
        assert config.primary_storage.get_file_info(
            file_2_id, util.path_from_uuid(file_2_id)) is not None

    assert any(
        log.message ==
        'Probably the following file has been updated during the migration and its hash is changed, cleaning up from the new filesystem'
        for log in caplog.records)
Example #9
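# Variant of migrate_gear_files built on pyfilesystem's fs.move.copy_file, mirroring
# the migrate_file variant above.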
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug('    file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.move.copy_file(src_fs=src_fs, src_path=file_path, dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')

        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.move.copy_file(src_fs=local_fs, src_path=f_old_path, dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }

        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update(
            {'_id': f['gear_id']},
            {'$set': update_set}
        )
Example #10
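# Remove container files that were marked as deleted more than 72 hours ago: delete
# the blobs from storage and pull the file entries from their documents. With a
# project id and --job-phi, deleted containers themselves are removed and job PHI /
# job logs are purged in batches of 100 containers.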
def cleanup_files(remove_all, origins, project_id, job_phi):
    log.info(
        'Cleanup deleted container (projects, acquisitions, sessions, collections, analyses) files...'
    )

    deleted_date_cutoff = datetime.datetime.now() - datetime.timedelta(
        hours=72)
    container_ids = []

    for container in cont_names:
        log.info("Cleaning up %s" % container)

        pipeline = [
            {
                "$match": {
                    "$or": [{
                        "files.deleted": {
                            "$lte": deleted_date_cutoff
                        }
                    }, {
                        "deleted": {
                            "$lte": deleted_date_cutoff
                        }
                    }]
                }
            },
            {
                "$project": {
                    "files": {
                        "$ifNull": [
                            {
                                "$filter": {
                                    "input": "$files",
                                    "as": "item",
                                    "cond": {
                                        "$or": [
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {
                                                        "$lte": [
                                                            "$$item.deleted",
                                                            deleted_date_cutoff
                                                        ]
                                                    },
                                                    {
                                                        "$ifNull": [
                                                            "$$item.deleted",
                                                            False
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {
                                                        "$lte": [
                                                            "$deleted",
                                                            deleted_date_cutoff
                                                        ]
                                                    },
                                                    {
                                                        "$ifNull":
                                                        ["$deleted", False]
                                                    }
                                                ]
                                            }
                                        ]
                                    }
                                }
                            },
                            []
                        ]
                    },
                    "deleted": 1
                }
            }
        ]
        if project_id:
            # Use the id field or parents.project field to filter results
            # instead of date of deletion
            project_filter = {
                '$or': [{
                    '_id': bson.ObjectId(project_id)
                }, {
                    'parents.project': bson.ObjectId(project_id)
                }]
            }

            # We don't care about time of deletion for single project snipes
            pipeline[0]['$match'].pop('$or')
            deleted_filter = {
                '$or': [{
                    'files.deleted': {
                        '$exists': True
                    }
                }, {
                    'deleted': {
                        '$exists': True
                    }
                }]
            }

            pipeline[0]['$match']['$and'] = [deleted_filter, project_filter]
            pipeline[1]['$project'] = {'files': 1, 'deleted': 1}

        cursor = db.get_collection(container).aggregate(pipeline)
        job_operations = []
        job_log_operations = []
        jobs_modified = 0
        job_logs_deleted = 0

        for document in cursor:
            document_deleted = False

            if project_id and job_phi:
                # Append the container id to the list to purge jobs of phi
                container_ids.append(document['_id'])
                if document.get('deleted'):
                    # if the document is deleted, remove it from the database
                    # since it might have phi from engine uploads
                    # NOTE: we only do this if job-phi is also set so that,
                    # if needed, we can go back and delete the job phi
                    response = db.get_collection(container).delete_one(
                        {'_id': document['_id']})
                    document_deleted = response.deleted_count == 1

            for i, f in enumerate(document.get('files', [])):
                if not remove_all and f['origin']['type'] not in origins:
                    log.debug(
                        '  skipping %s/%s/%s since it was uploaded by %s',
                        container, document['_id'], f['name'],
                        f['origin']['type'])
                    continue

                log.debug(
                    '  file marked to delete: %s, parent marked to delete: %s',
                    f.get('deleted', False), document.get('deleted', False))
                log.debug('  removing %s/%s/%s', container, document['_id'],
                          f['name'])

                if f.get('_id'):
                    uuid_path = util.path_from_uuid(f['_id'])
                    if fs.get_file_info(f['_id'], uuid_path):
                        log.debug('    removing from %s', fs)
                        fs.remove_file(f['_id'], uuid_path)

                    if not document_deleted:
                        # only need to remove the file from the database
                        # if the document wasn't already removed
                        log.debug('    removing from database')
                        update_result = db.get_collection(
                            container).update_one(
                                {'_id': document['_id']},
                                {'$pull': {
                                    'files': {
                                        '_id': f['_id']
                                    }
                                }})
                        if not update_result.modified_count == 1:
                            log.error(
                                '    couldn\'t remove file from database')
                            exit(1)

            if len(container_ids) == 100:
                # Chunk the job lookups: process the jobs belonging to
                # 100 containers at a time
                job_operations, job_log_operations = generate_job_operations(
                    container_ids)
                result = execute_job_operations(job_operations,
                                                job_log_operations)
                jobs_modified += result[0]
                job_logs_deleted += result[1]
                container_ids = []

    if container_ids:
        # process the jobs for the remaining (< 100) containers
        job_operations, job_log_operations = generate_job_operations(
            container_ids)
        result = execute_job_operations(job_operations, job_log_operations)
        jobs_modified += result[0]
        job_logs_deleted += result[1]
        container_ids = []
    log.debug('Purged phi from %s, and removed %s jobs logs', jobs_modified,
              job_logs_deleted)
Example #11
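# Test: create a project with a session file, an ad-hoc analysis, and a completed
# job with logs and produced metadata; verify that single-project cleanup with
# --job-phi is a no-op while the project exists, and that after deleting the project
# the file, job PHI, logs, and all descendant containers are purged.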
def test_cleanup_single_project(data_builder, default_payload, randstr,
                                file_form, as_admin, as_drone, api_db,
                                cleanup_deleted):
    project_id = data_builder.create_project()
    session_id = data_builder.create_session()
    acquisition_id = data_builder.create_acquisition()

    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_1})['files'][0]
    file_id_1 = file_info['_id']

    # Create ad-hoc analysis
    r = as_admin.post('/sessions/' + session_id + '/analyses',
                      json={
                          'label':
                          'offline',
                          'inputs': [{
                              'type': 'session',
                              'id': session_id,
                              'name': file_name_1
                          }]
                      })
    assert r.ok
    analysis = r.json()['_id']

    # get the ticket
    r = as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                     params={'ticket': ''})
    assert r.ok
    ticket = r.json()['ticket']

    # download the file
    assert as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                        params={
                            'ticket': ticket
                        }).ok

    # run a job
    gear_doc = default_payload['gear']['gear']
    gear_doc['inputs'] = {'dicom': {'base': 'file'}}
    gear = data_builder.create_gear(gear=gear_doc)

    job_data = {
        'gear_id': gear,
        'inputs': {
            'dicom': {
                'type': 'session',
                'id': session_id,
                'name': file_name_1
            }
        },
        'config': {
            'two-digit multiple of ten': 20
        },
        'destination': {
            'type': 'acquisition',
            'id': acquisition_id
        },
        'tags': ['test-tag']
    }
    # add job with explicit destination
    r = as_admin.post('/jobs/add', json=job_data)
    assert r.ok
    job_id = r.json()['_id']

    # start job (Adds logs)
    r = as_admin.get('/jobs/next')
    assert r.ok

    # prepare completion (send success status before engine upload)
    r = as_drone.post('/jobs/' + job_id + '/prepare-complete')
    assert r.ok

    # verify that job ticket has been created
    job_ticket = api_db.job_tickets.find_one({'job': job_id})
    assert job_ticket['timestamp']

    produced_metadata = {
        'project': {
            'label': 'engine project',
            'info': {
                'test': 'p'
            }
        },
        'session': {
            'label': 'engine session',
            'subject': {
                'code': 'engine subject',
                'sex': 'male',
                'age': 86400
            },
            'info': {
                'test': 's'
            }
        },
        'acquisition': {
            'label':
            'engine acquisition',
            'timestamp':
            '2016-06-20T21:57:36+00:00',
            'info': {
                'test': 'a'
            },
            'files': [{
                'name': 'result.txt',
                'type': 'text',
                'info': {
                    'test': 'f0'
                }
            }]
        }
    }

    # engine upload
    r = as_drone.post('/engine',
                      params={
                          'level': 'acquisition',
                          'id': acquisition_id,
                          'job': job_id,
                          'job_ticket': job_ticket['_id']
                      },
                      files=file_form('result.txt', meta=produced_metadata))
    assert r.ok

    # Make sure produced metadata and logs exist
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # Try cleaning undeleted project
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project',
                         project_id, '--job-phi')

    # Make sure file is still there
    assert config.primary_storage.get_file_info(file_id_1,
                                                util.path_from_uuid(file_id_1))

    # Make sure job phi is still there
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # delete the project
    r = as_admin.delete('/projects/' + project_id)
    assert r.ok

    # Run cleanup again
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project',
                         project_id, '--job-phi')

    # Make sure file is not there
    assert not config.primary_storage.get_file_info(
        file_id_1, util.path_from_uuid(file_id_1))

    # Check job phi
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert not job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert not r.json().get('logs')

    assert not api_db.projects.find_one({'_id': ObjectId(project_id)})
    assert not api_db.subjects.find_one(
        {'parents.project': ObjectId(project_id)})
    assert not api_db.sessions.find_one(
        {'parents.project': ObjectId(project_id)})
    assert not api_db.acquisitions.find_one(
        {'parents.project': ObjectId(project_id)})
    assert not api_db.analyses.find_one(
        {'parents.project': ObjectId(project_id)})
Example #12
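# Fixture: create two temp gears; move the first gear's rootfs blob to its
# hash-based path (and unset exchange.rootfs-id) and the second gear's blob to its
# UUID path on the local filesystem. Yields (gear_id, file_path) pairs and cleans up
# the blobs and gear documents on teardown.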
def gears_to_migrate(api_db, as_admin, randstr, file_form):
    def gen_gear_meta(gear_name):
        return {
            'gear': {
                "version": '0.0.1',
                "config": {},
                "name": gear_name,
                "inputs": {
                    "file": {
                        "base": "file",
                        "description": "Any image."
                    }
                },
                "maintainer": "Test",
                "description": "Test",
                "license": "Other",
                "author": "Test",
                "url": "http://example.example",
                "label": "Test Gear",
                "flywheel": "0",
                "source": "http://example.example"
            }
        }

    gears = []

    gear_name_1 = randstr()

    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()
    r = as_admin.post('/gears/temp',
                      files=file_form((file_name, file_content),
                                      meta=gen_gear_meta(gear_name_1)))
    gear_id_1 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_1)
    gear_json_1 = r.json()

    file_hash__1 = 'v0-' + gear_json_1['exchange']['rootfs-hash'].replace(
        ':', '-')
    file_id_1 = gear_json_1['exchange']['rootfs-id']

    file_path = unicode(util.path_from_hash(file_hash__1))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id_1, config.local_fs, file_path)

    api_db['gears'].find_one_and_update({'_id': ObjectId(gear_id_1)},
                                        {'$unset': {
                                            'exchange.rootfs-id': ''
                                        }})

    gears.append((gear_id_1, file_path))

    gear_name_2 = randstr()
    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()
    r = as_admin.post('/gears/temp',
                      files=file_form((file_name, file_content),
                                      meta=gen_gear_meta(gear_name_2)))
    gear_id_2 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_2)
    gear_json_2 = r.json()

    file_id_2 = gear_json_2['exchange']['rootfs-id']

    file_path = unicode(util.path_from_uuid(file_id_2))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id_2, config.local_fs, file_path)
    gears.append((gear_id_2, file_path))

    yield gears

    # clean up
    gear_json_1 = api_db['gears'].find_one({'_id': ObjectId(gear_id_1)})
    gear_json_2 = api_db['gears'].find_one({'_id': ObjectId(gear_id_2)})
    files_to_delete = []
    files_to_delete.append(
        util.path_from_uuid(gear_json_1['exchange'].get('rootfs-id', '')))
    files_to_delete.append(
        util.path_from_uuid(gear_json_1['exchange'].get('rootfs-hash', '')))
    files_to_delete.append(
        util.path_from_uuid(gear_json_2['exchange'].get('rootfs-id', '')))

    for f_path in files_to_delete:
        try:
            config.primary_storage.remove_file(None, f_path)
        except:
            pass

    api_db['gears'].delete_one({'_id': ObjectId(gear_id_1)})
    api_db['gears'].delete_one({'_id': ObjectId(gear_id_2)})
Example #13
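# Fixture: upload three session files and stage them as a CAS file, a UUID file, and
# a UUID file on the secondary legacy filesystem (3-way split storage); yields
# (session_id, name, url, path) tuples and removes the blobs from primary storage on
# teardown.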
def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form):
    # Create a session
    session_id = data_builder.create_session()

    files = []

    # Create a CAS file
    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_1})['files'][0]
    file_id_1 = file_info['_id']
    file_hash_1 = file_info['hash']
    url_1 = '/sessions/' + session_id + '/files/' + file_name_1

    api_db['sessions'].find_one_and_update({'files.name': file_name_1},
                                           {'$unset': {
                                               'files.$._id': ''
                                           }})

    move_file_to_legacy(file_id_1, util.path_from_hash(file_hash_1))
    files.append(
        (session_id, file_name_1, url_1, util.path_from_hash(file_hash_1)))

    # Create a UUID file
    file_name_2 = '%s.csv' % randstr()
    file_content_2 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_2, file_content_2)))

    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_2})['files'][1]
    file_id_2 = file_info['_id']
    url_2 = '/sessions/' + session_id + '/files/' + file_name_2

    move_file_to_legacy(file_id_2, util.path_from_uuid(file_id_2))
    files.append(
        (session_id, file_name_2, url_2, util.path_from_uuid(file_id_2)))

    ### Temp fix for 3-way split storages, see api.config.local_fs2 for details
    # Create a UUID file in legacy/v1 for testing 3-way split storage
    file_name_3 = '%s.csv' % randstr()
    file_content_3 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_3, file_content_3)))
    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_3})['files'][2]
    file_id_3 = file_info['_id']
    url_3 = '/sessions/' + session_id + '/files/' + file_name_3

    move_file_to_legacy2(file_id_3, util.path_from_uuid(file_id_3))
    files.append(
        (session_id, file_name_3, url_3, util.path_from_uuid(file_id_3)))
    ###

    yield files

    # Clean up, get the files
    files = api_db['sessions'].find_one({'_id': ObjectId(session_id)})['files']
    # Delete the files
    for f in files:
        try:
            config.primary_storage.remove_file(f['_id'],
                                               util.path_from_uuid(f['_id']))
        except:
            pass
Example #14
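# Download test covering tag-filtered batch downloads, duplicate session-label
# folder naming, ticket and IP validation, nonexistent and invalid targets, bulk and
# symlink downloads, legacy CAS file handling, and .MISSING placeholders for blobs
# missing from storage.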
def test_download_k(data_builder, file_form, as_admin, as_root, api_db, legacy_cas_file):
    project = data_builder.create_project(label='project1')
    session = data_builder.create_session(label='session1', project=project)
    session2 = data_builder.create_session(label='session1', project=project)
    session3 = data_builder.create_session(label='session1', project=project)
    session4 = data_builder.create_session(label='session/1', project=project)
    acquisition = data_builder.create_acquisition(session=session)
    acquisition2 = data_builder.create_acquisition(session=session2)
    acquisition3 = data_builder.create_acquisition(session=session3)
    acquisition4 = data_builder.create_acquisition(session=session4)

    # upload the same file to each container created and use different tags to
    # facilitate download filter tests:
    # acquisition: [], session: ['plus'], project: ['plus', 'minus']
    file_name = 'test.csv'
    as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv'}))

    as_admin.post('/acquisitions/' + acquisition2 + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv'}))

    as_admin.post('/acquisitions/' + acquisition3 + '/files', files=file_form(
        'test.txt', meta={'name': file_name, 'type': 'text'}))

    as_admin.post('/acquisitions/' + acquisition4 + '/files', files=file_form(
        'test.txt', meta={'name': file_name, 'type': 'text'}))

    as_admin.post('/sessions/' + session + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv', 'tags': ['plus']}))

    as_admin.post('/projects/' + project + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv', 'tags': ['plus', 'minus']}))

    # also upload and then delete a file to make sure it doesn't show up
    as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form(
        file_name, meta={'name': 'deleted_'+file_name, 'type': 'csv'}))
    r = as_admin.delete('/acquisitions/' + acquisition + '/files/deleted_' + file_name)
    assert r.ok

    missing_object_id = '000000000000000000000000'

    # Try to download w/ nonexistent ticket
    r = as_admin.get('/download', params={'ticket': missing_object_id})
    assert r.status_code == 404

    # Retrieve a ticket for a batch download as superuser
    r = as_root.post('/download', json={
        'optional': False,
        'filters': [{'tags': {
            '-': ['minus']
        }}],
        'nodes': [
            {'level': 'project', '_id': project},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_root.get('/download', params={'ticket': ticket})
    assert r.ok

    # Retrieve a ticket for a batch download
    r = as_admin.post('/download', json={
        'optional': False,
        'filters': [{'tags': {
            '-': ['minus']
        }}],
        'nodes': [
            {'level': 'project', '_id': project},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify each file in the tar has the correct file name
    found_second_session = False
    found_third_session = False
    found_fourth_session = False
    for tarinfo in tar:
        assert os.path.basename(tarinfo.name) == file_name
        if 'session1_0' in str(tarinfo.name):
            found_second_session = True
        if 'session1_1' in str(tarinfo.name):
            found_third_session = True
        if 'session1_2' in str(tarinfo.name):
            found_fourth_session = True
    assert found_second_session
    assert found_third_session
    assert found_fourth_session
    tar.close()

    # Download one session with many acquisitions and make sure they are in the same subject folder

    acquisition3 = data_builder.create_acquisition(session=session)
    r = as_admin.post('/acquisitions/' + acquisition3 + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv'}))
    assert r.ok

    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'acquisition', '_id': acquisition},
            {'level': 'acquisition', '_id': acquisition3},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify each file in the tar has the correct file name
    found_second_session = False
    for tarinfo in tar:
        assert os.path.basename(tarinfo.name) == file_name
        if 'session1_0' in str(tarinfo.name):
            found_second_session = True
    assert not found_second_session

    tar.close()

    # Try to perform the download from a different IP
    update_result = api_db.downloads.update_one(
        {'_id': ticket},
        {'$set': {'ip': '255.255.255.255'}})
    assert update_result.modified_count == 1

    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.status_code == 400

    # Try to retrieve a ticket referencing nonexistent containers
    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'project', '_id': missing_object_id},
            {'level': 'session', '_id': missing_object_id},
            {'level': 'acquisition', '_id': missing_object_id},
        ]
    })
    assert r.status_code == 404

    # Try to retrieve ticket for bulk download w/ invalid container name
    # (not project|session|acquisition)
    r = as_admin.post('/download', params={'bulk': 'true'}, json={
        'files': [{'container_name': 'subject', 'container_id': missing_object_id, 'filename': 'nosuch.csv'}]
    })
    assert r.status_code == 400

    # Try to retrieve ticket for bulk download referencing nonexistent file
    r = as_admin.post('/download', params={'bulk': 'true'}, json={
        'files': [{'container_name': 'project', 'container_id': project, 'filename': 'nosuch.csv'}]
    })
    assert r.status_code == 404

    # Retrieve ticket for bulk download
    r = as_admin.post('/download', params={'bulk': 'true'}, json={
        'files': [{'container_name': 'project', 'container_id': project, 'filename': file_name}]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download using symlinks
    r = as_admin.get('/download', params={'ticket': ticket, 'symlinks': 'true'})
    assert r.ok

    # test legacy cas file handling
    (project_legacy, file_name_legacy, file_content) = legacy_cas_file
    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'project', '_id': project_legacy},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify a single file in tar with correct file name
    for tarinfo in tar:
        assert os.path.basename(tarinfo.name) == file_name_legacy

    tar.close()

    # test missing file handling

    file_id = api_db.acquisitions.find_one({'_id': ObjectId(acquisition)})['files'][0]['_id']
    config.fs.remove(util.path_from_uuid(file_id))

    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'acquisition', '_id': acquisition},
            {'level': 'acquisition', '_id': acquisition3},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify the tar contents
    tarinfo_list = list(tar)
    # it contains two entries; the missing file appears with a .MISSING suffix
    assert len(tarinfo_list) == 2
    assert len([tarinfo for tarinfo in tarinfo_list if tarinfo.name.endswith('.MISSING')]) == 1

    tar.close()
Example #15
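# Earlier, simpler variant of cleanup_files: remove files deleted more than 72 hours
# ago from the filesystem and pull them from their documents, without per-project
# filtering or job PHI cleanup.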
def cleanup_files(remove_all, origins):
    log.info('Cleanup deleted container (projects, acquisitions, sessions, collections, analyses) files...')

    d = datetime.datetime.now() - datetime.timedelta(hours=72)

    for container in cont_names:
        log.info("Cleaning up %s" % container)

        cursor = db.get_collection(container).aggregate([
            {
                "$match": {
                    "$or": [
                        {"files.deleted": {"$lte": d}},
                        {"deleted": {"$lte": d}}
                    ]
                }
            },
            {
                "$project": {
                    "files": {
                        "$ifNull": [
                            {
                                "$filter": {
                                    "input": "$files",
                                    "as": "item",
                                    "cond": {
                                        "$or": [
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {"$lte": ["$$item.deleted", d]},
                                                    {"$ifNull": ["$$item.deleted", False]}
                                                ]
                                            },
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {"$lte": ["$deleted", d]},
                                                    {"$ifNull": ["$deleted", False]}
                                                ]
                                            }
                                        ]
                                    }
                                }
                            },
                            []
                        ]
                    },
                    "deleted": 1
                }
            }
        ])

        for document in cursor:
            for i, f in enumerate(document.get('files', [])):
                if not remove_all and f['origin']['type'] not in origins:
                    log.debug('  skipping %s/%s/%s since it was uploaded by %s',
                              container, document['_id'], f['name'], f['origin']['type'])
                    continue

                log.debug('  file marked to delete: %s, parent marked to delete: %s',
                          f.get('deleted', False),
                          document.get('deleted', False))
                log.debug('  removing %s/%s/%s', container, document['_id'], f['name'])

                if f.get('_id'):
                    uuid_path = util.path_from_uuid(f['_id'])
                    if fs.exists(uuid_path):
                        log.debug('    removing from %s', fs)
                        fs.remove(uuid_path)

                    log.debug('    removing from database')
                    updated_doc = db.get_collection(container).update({'_id': document['_id']},
                                                                      {'$pull': {'files': {'_id': f['_id']}}})
                    if not updated_doc['nModified']:
                        log.error('    couldn\'t remove file from database')
                        exit(1)
Example #16
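# Test of cleanup_deleted with --reaper and --all: files deleted less than 72 hours
# ago or with a user origin are kept, device-origin files older than 72 hours are
# removed from storage and the database, and the same rules apply when the parent
# session is the container marked as deleted.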
def test_cleanup_deleted_files(data_builder, randstr, file_form, as_admin, api_db, cleanup_deleted, with_site_settings):
    project = data_builder.create_project(providers={'storage': 'deadbeefdeadbeefdeadbeef'})
    subject = data_builder.create_subject(project=project, code='deleted_files_test')
    session_id = data_builder.create_session()

    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files', files=file_form((file_name_1, file_content_1)))

    # get the ticket
    r = as_admin.get('/sessions/' + session_id + '/files/' + file_name_1, params={'ticket': ''})
    assert r.ok
    ticket = r.json()['ticket']

    # download the file
    assert as_admin.get('/sessions/' + session_id + '/files/' + file_name_1, params={'ticket': ticket}).ok

    # Test that the file won't be deleted if it was deleted in the last 72 hours
    d = datetime.datetime.now() - datetime.timedelta(hours=70)

    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$set': {'files.$.deleted': d}}
    )

    file_info = api_db['sessions'].find_one(
        {'files.name': file_name_1}
    )['files'][0]
    file_id_1 = file_info['_id']

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # TODO: we will have to be sure we get the same provider when we move to multi provider support
    storage_service = StorageProviderService()
    storage = storage_service.determine_provider(None, None, force_site_provider=True)
    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1)) is not None

    # file won't be deleted after 72 hours if the origin is a user
    d = datetime.datetime.now() - datetime.timedelta(hours=73)

    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$set': {'files.$.deleted': d}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1)) is not None

    # file deleted after 72 hours if the origin is not a user
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$set': {'files.$.origin.type': 'device'}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # file removed from the filesystem
    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1)) is None

    # file also removed from the database
    document = api_db['sessions'].find_one(
        {'files.name': file_name_1}
    )

    assert document is None

    # check when the parent container is deleted

    session_id_2 = data_builder.create_session()

    file_name_2 = '%s.csv' % randstr()
    file_content_2 = randstr()
    as_admin.post('/sessions/' + session_id_2 + '/files', files=file_form((file_name_2, file_content_2)))

    file_name_3 = '%s.csv' % randstr()
    file_content_3 = randstr()
    as_admin.post('/sessions/' + session_id_2 + '/files', files=file_form((file_name_3, file_content_3)))

    # Test that the file won't be deleted if it was deleted in the last 72 hours
    d = datetime.datetime.now() - datetime.timedelta(hours=70)

    # Mark session as deleted
    api_db['sessions'].find_one_and_update(
        {'_id': ObjectId(session_id_2)},
        {'$set': {'deleted': d}}
    )

    # Look up the two uploaded test files
    file_info = api_db['sessions'].find_one(
        {'files.name': file_name_2}
    )['files'][0]
    file_id_2 = file_info['_id']

    file_info = api_db['sessions'].find_one(
        {'files.name': file_name_3}
    )['files'][1]
    file_id_3 = file_info['_id']

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # files still exist
    assert storage.storage_plugin.get_file_info(file_id_2, util.path_from_uuid(file_id_2)) is not None
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is not None

    # file won't be deleted after 72 hours if the origin is a user
    d = datetime.datetime.now() - datetime.timedelta(hours=73)

    api_db['sessions'].find_one_and_update(
        {'_id': ObjectId(session_id_2)},
        {'$set': {'deleted': d}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    assert storage.storage_plugin.get_file_info(file_id_2, util.path_from_uuid(file_id_2)) is not None
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is not None

    # file deleted after 72 hours if the origin is not a user
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_2},
        {'$set': {'files.$.origin.type': 'device'}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # first file removed from the filesystem
    assert storage.storage_plugin.get_file_info(file_id_2, util.path_from_uuid(file_id_2)) is None
    # but the second file is still there
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is not None

    # upload a file into the first session to check that it is kept when we use the --all flag,
    # while other files that are marked as deleted are removed
    file_name_4 = '%s.csv' % randstr()
    file_content_4 = randstr()
    as_admin.post('/sessions/' + session_id + '/files', files=file_form((file_name_4, file_content_4)))

    file_info = api_db['sessions'].find_one(
        {'files.name': file_name_4}
    )['files'][0]
    file_id_4 = file_info['_id']

    # with the --all flag we delete every file that is marked as deleted,
    # regardless of its origin
    cleanup_deleted.main('--log-level', 'DEBUG', '--all')
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is None
    # files that are not marked as deleted are kept
    assert storage.storage_plugin.get_file_info(file_id_4, util.path_from_uuid(file_id_4)) is not None

    # Mark the first session as deleted
    api_db['sessions'].find_one_and_update(
        {'_id': ObjectId(session_id)},
        {'$set': {'deleted': d}}
    )

    # now the fourth file will be deleted too
    cleanup_deleted.main('--log-level', 'DEBUG', '--all')
    assert storage.storage_plugin.get_file_info(file_id_4, util.path_from_uuid(file_id_4)) is None
Example #17
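# Variant of the earlier single-project cleanup test that resolves the site storage
# provider via StorageProviderService and reuses a shared site gear; otherwise the
# same cleanup assertions.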
def test_cleanup_single_project(data_builder, default_payload, randstr, file_form, as_admin, as_drone, api_db, cleanup_deleted, with_site_settings, site_gear):

    # Some tests leave partial jobs in the db that break this test
    # This is a quick and dirty way to get to a clean state without filtering
    api_db.jobs.remove({})

    # Projects must have a provider for job/gear uploads to work
    storage_service = StorageProviderService()
    storage = storage_service.determine_provider(None, None, force_site_provider=True)
    project_id = data_builder.create_project(providers={'storage': str(storage.provider_id)})
    session_id = data_builder.create_session()
    acquisition_id = data_builder.create_acquisition()

    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files', files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one(
        {'files.name': file_name_1}
    )['files'][0]
    file_id_1 = file_info['_id']

    # Create ad-hoc analysis
    r = as_admin.post('/sessions/' + session_id + '/analyses', json={
        'label': 'offline',
        'inputs': [{'type': 'session', 'id': session_id, 'name': file_name_1}]
    })
    assert r.ok
    analysis = r.json()['_id']

    # get the ticket
    r = as_admin.get('/sessions/' + session_id + '/files/' + file_name_1, params={'ticket': ''})
    assert r.ok
    ticket = r.json()['ticket']

    # download the file
    assert as_admin.get('/sessions/' + session_id + '/files/' + file_name_1, params={'ticket': ticket}).ok

    # run a job
    import bson
    api_db.gears.update({'_id': bson.ObjectId(site_gear)}, {'$set': {'gear.inputs': {'dicom': {'base': 'file'}}}})
    gear = site_gear

    job_data = {
        'gear_id': gear,
        'inputs': {
            'dicom': {
                'type': 'session',
                'id': session_id,
                'name': file_name_1
            }
        },
        'config': { 'two-digit multiple of ten': 20 },
        'destination': {
            'type': 'acquisition',
            'id': acquisition_id
        },
        'tags': [ 'test-tag' ]
    }
    # add job with explicit destination
    r = as_admin.post('/jobs/add', json=job_data)
    assert r.ok
    job_id = r.json()['_id']

    # start job (Adds logs)
    r = as_admin.get('/jobs/next')
    assert r.ok

    # prepare completion (send success status before engine upload)
    r = as_drone.post('/jobs/' + job_id + '/prepare-complete')
    assert r.ok

    # verify that job ticket has been created
    job_ticket = api_db.job_tickets.find_one({'job': job_id})
    assert job_ticket['timestamp']


    produced_metadata = {
        'project': {
            'label': 'engine project',
            'info': {'test': 'p'}
        },
        'session': {
            'label': 'engine session',
            'subject': {'code': 'engine subject', 'sex': 'male', 'age': 86400},
            'info': {'test': 's'}
        },
        'acquisition': {
            'label': 'engine acquisition',
            'timestamp': '2016-06-20T21:57:36+00:00',
            'info': {'test': 'a'},
            'files': [{
                'name': 'result.txt',
                'type': 'text',
                'info': {'test': 'f0'}
            }]
        }
    }

    # engine upload
    r = as_drone.post('/engine',
        params={'level': 'acquisition', 'id': acquisition_id, 'job': job_id, 'job_ticket': job_ticket['_id']},
        files=file_form('result.txt', meta=produced_metadata)
    )
    assert r.ok

    # Make sure produced metadata and logs exist
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # Try cleaning undeleted project
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project', project_id, '--job-phi')

    # Make sure file is still there
    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1))

    # Make sure job phi is still there
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # delete the project
    r = as_admin.delete('/projects/' + project_id)
    assert r.ok

    # Run cleanup again
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project', project_id, '--job-phi')

    # Make sure file is not there
    assert not storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1))

    # Check job phi
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert not job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert not r.json().get('logs')

    assert not api_db.projects.find_one({'_id': ObjectId(project_id)})
    assert not api_db.subjects.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.sessions.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.acquisitions.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.analyses.find_one({'parents.project': ObjectId(project_id)})