Example #1
def placeholders_for_collections():
    log.info('Create placeholders for collections')
    COLLECTIONS_PREFIXES = [('projects', 'files'),
                            ('acquisitions', 'files'),
                            ('analyses', 'files'),
                            ('sessions', 'files'),
                            ('sessions', 'subject.files'),
                            ('collections', 'files')]

    _files = []

    for collection, prefix in COLLECTIONS_PREFIXES:
        cursor = config.db.get_collection(collection).find({})
        for document in cursor:
            for f in get_files_by_prefix(document, prefix):
                f_dict = {
                    'collection_id': document.get('_id'),
                    'collection': collection,
                    'fileinfo': f,
                    'prefix': prefix
                }
                _files.append(f_dict)

    base = config.get_item('persistent', 'data_path')
    for i, f in enumerate(_files):
        f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash']))
        create_placeholder_file(f_path, f['fileinfo']['size'])

        # Show progress
        if i % (len(_files) / 10 + 1) == 0:
            log.info('Processed %s of %s files ...' % (i, len(_files)))
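The util.path_from_hash and create_placeholder_file helpers used above are not shown in any of these examples. The sketch below is only an assumption of how they could look, inferred from the 'v0-sha384-<hex>' hashes and size fields seen elsewhere in this listing: a hash-addressed (CAS) directory layout plus a placeholder file of the recorded size.

import os

def path_from_hash(hash_):
    # Assumed layout: 'v0-sha384-<hex>' -> 'v0/sha384/<hex[0:2]>/<hex[2:4]>/v0-sha384-<hex>'
    version, algo, digest = hash_.split('-', 2)
    return os.path.join(version, algo, digest[0:2], digest[2:4], hash_)

def create_placeholder_file(path, size):
    # Create an empty file of the expected size without writing any real data.
    dirname = os.path.dirname(path)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(path, 'wb') as fd:
        fd.truncate(size)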
Example #2
def legacy_cas_file(as_admin, api_db, data_builder, randstr, file_form):
    """Yield legacy CAS file"""
    project = data_builder.create_project()
    file_name = '%s.csv' % randstr()
    file_content = randstr()
    as_admin.post('/projects/' + project + '/files',
                  files=file_form((file_name, file_content)))

    file_info = api_db['projects'].find_one({'files.name':
                                             file_name})['files'][0]
    file_id = file_info['_id']
    file_hash = file_info['hash']
    # Unset the file _id to exercise CAS backward compatibility
    api_db['projects'].find_one_and_update({'files.name': file_name},
                                           {'$unset': {
                                               'files.$._id': ''
                                           }})

    file_path = unicode(util.path_from_hash(file_hash))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.exists(target_dir):
        config.local_fs.makedirs(target_dir)
    fs.move.move_file(src_fs=config.fs,
                      src_path=util.path_from_uuid(file_id),
                      dst_fs=config.local_fs,
                      dst_path=file_path)

    yield (project, file_name, file_content)

    # clean up
    config.local_fs.remove(file_path)
    config.local_fs.removetree(target_dir)
    api_db['projects'].delete_one({'_id': project})
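A hypothetical test consuming the fixture above; the download endpoint and assertions are illustrative only and follow the URL pattern already used in the POST.

def test_download_legacy_cas_file(as_admin, legacy_cas_file):
    project, file_name, file_content = legacy_cas_file
    r = as_admin.get('/projects/' + project + '/files/' + file_name)
    assert r.ok
    assert r.content == file_content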
Example #3
def data(args):
    log.info('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [
                os.path.join(dirpath, fn) for fn in filenames
                if not fn.startswith('.')
        ]:
            if not os.path.islink(filepath) and filepath.endswith('.zip'):
                files.append(filepath)
        dirnames[:] = [
            dn for dn in dirnames if not dn.startswith('.')
        ]  # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    log.info('found %d files to sort (ignoring symlinks and dotfiles)' %
             file_cnt)
    for i, filepath in enumerate(files):
        log.info('Loading     %s [%s] (%d/%d)' %
                 (os.path.basename(filepath),
                  util.hrsize(os.path.getsize(filepath)), i + 1, file_cnt))
        hash_ = hashlib.sha384()
        size = os.path.getsize(filepath)
        try:
            metadata = json.loads(zipfile.ZipFile(filepath).comment)
        except ValueError as e:
            log.warning(str(e))
            continue
        container = reaperutil.create_container_hierarchy(metadata)
        with open(filepath, 'rb') as fd:
            for chunk in iter(lambda: fd.read(2**20), ''):
                hash_.update(chunk)
        computed_hash = 'v0-sha384-' + hash_.hexdigest()
        destpath = os.path.join(config.get_item('persistent', 'data_path'),
                                util.path_from_hash(computed_hash))
        dir_destpath = os.path.dirname(destpath)
        filename = os.path.basename(filepath)
        if not os.path.exists(dir_destpath):
            os.makedirs(dir_destpath)
        if args.copy:
            shutil.copyfile(filepath, destpath)
        else:
            shutil.move(filepath, destpath)
        created = modified = datetime.datetime.utcnow()
        fileinfo = {
            'name': filename,
            'size': size,
            'hash': computed_hash,
            'type': 'dicom',  # we are only bootstrapping dicoms at the moment
            'created': created,
            'modified': modified
        }
        container.add_file(fileinfo)
        rules.create_jobs(config.db, container.acquisition, 'acquisition',
                          fileinfo)
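Note that the chunked hashing loop above relies on Python 2 semantics: fd.read() on a file opened in 'rb' mode returns str, so '' is a valid sentinel for iter(). Under Python 3 the sentinel must be b''; a small, version-agnostic variant of the same loop could look like this (the helper name is ours, not the project's).

import hashlib

def sha384_of_file(path, chunk_size=2**20):
    # Same chunked hashing as above, but with a bytes sentinel so the loop
    # terminates correctly on both Python 2 and Python 3.
    hash_ = hashlib.sha384()
    with open(path, 'rb') as fd:
        for chunk in iter(lambda: fd.read(chunk_size), b''):
            hash_.update(chunk)
    return 'v0-sha384-' + hash_.hexdigest()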
Example #4
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(
                '    file already has an id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug('    generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')
        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }

        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {
                '_id': f['container_id'],
                f['prefix'] + '.name': f['fileinfo']['name'],
                f['prefix'] + '.hash': f['fileinfo']['hash']
            }, {'$set': update_set})

        if not updated_doc:
            log.info(
                'The file was probably updated during the migration and its '
                'hash changed; cleaning up the copy on the new filesystem')
            target_fs.remove_file(file_id, f_new_path)
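buffer_copy is another helper that is not shown here; a plausible implementation is a plain chunked copy between two already-open file objects, so large files never have to fit into memory at once.

def buffer_copy(src, dst, chunk_size=2**20):
    # Assumed helper: stream src into dst in fixed-size chunks.
    while True:
        chunk = src.read(chunk_size)
        if not chunk:
            break
        dst.write(chunk)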
Example #5
def data(args):
    log.info('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
            if not os.path.islink(filepath) and filepath.endswith('.zip'):
                files.append(filepath)
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    log.info('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
    for i, filepath in enumerate(files):
        log.info('Loading     %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i+1, file_cnt))
        hash_ = hashlib.sha384()
        size = os.path.getsize(filepath)
        try:
            metadata = json.loads(zipfile.ZipFile(filepath).comment)
        except ValueError as e:
            log.warning(str(e))
            continue
        container = reaperutil.create_container_hierarchy(metadata)
        with open(filepath, 'rb') as fd:
            for chunk in iter(lambda: fd.read(2**20), ''):
                hash_.update(chunk)
        computed_hash = 'v0-sha384-' + hash_.hexdigest()
        destpath = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash(computed_hash))
        dir_destpath = os.path.dirname(destpath)
        filename = os.path.basename(filepath)
        if not os.path.exists(dir_destpath):
            os.makedirs(dir_destpath)
        if args.copy:
            shutil.copyfile(filepath, destpath)
        else:
            shutil.move(filepath, destpath)
        created = modified = datetime.datetime.utcnow()
        fileinfo = {
            'name': filename,
            'size': size,
            'hash': computed_hash,
            'type': 'dicom', # we are only bootstrapping dicoms at the moment
            'created': created,
            'modified': modified
        }
        container.add_file(fileinfo)
        rules.create_jobs(config.db, container.acquisition, 'acquisition', fileinfo)
Example #6
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug('    file already has an id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.move.copy_file(src_fs=src_fs, src_path=file_path, dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug('    generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')
        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.move.copy_file(src_fs=local_fs, src_path=f_old_path, dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }

        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {'_id': f['container_id'],
             f['prefix'] + '.name': f['fileinfo']['name'],
             f['prefix'] + '.hash': f['fileinfo']['hash']},
            {'$set': update_set}
        )

        if not updated_doc:
            log.info('The file was probably updated during the migration and its '
                     'hash changed; cleaning up the copy on the new filesystem')
            target_fs.remove(f_new_path)
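get_src_fs_by_file_path is also not shown. In the pyfilesystem-based variant above it would plausibly probe each known source filesystem for the path and return the first match; the filesystem names below are placeholders, not names from the real script.

def get_src_fs_by_file_path(file_path):
    # Hypothetical: legacy_fs and local_fs stand in for whatever source
    # filesystems the migration script actually keeps at module level.
    for candidate in (legacy_fs, local_fs):
        if candidate.isfile(file_path):
            return candidate
    raise RuntimeError('file not found in any source storage: %s' % file_path)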
Example #7
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(
                '    file already has an id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')

        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }

        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update({'_id': f['gear_id']},
                                        {'$set': update_set})
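The rootfs hash is stored on the gear's exchange document as '<algorithm>:<hexdigest>' (the exact algorithm shown below is an assumption); prefixing 'v0-' and replacing ':' with '-' yields the same 'v0-<algorithm>-<hexdigest>' form that util.path_from_hash expects.

rootfs_hash = 'sha384:0af1e...'  # illustrative value only
file_hash = 'v0-' + rootfs_hash.replace(':', '-')
# file_hash == 'v0-sha384-0af1e...'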
Example #8
def placeholders_for_gears():
    log.info('Create placeholders for gears')
    cursor = config.db.get_collection('gears').find({})
    _files = []
    for document in cursor:
        if document['exchange']['git-commit'] == 'local':
            f_dict = {
                'gear_id': document['_id'],
                'gear_name': document['gear']['name'],
                'exchange': document['exchange']
            }
            _files.append(f_dict)

    base = config.get_item('persistent', 'data_path')
    for i, f in enumerate(_files):
        f_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_path = os.path.join(base, util.path_from_hash(f_hash))
        create_placeholder_file(f_path, f['gear_name'])

        # Show progress
        if i % (len(_files) / 10 + 1) == 0:
            log.info('Processed %s of %s gear files ...' % (i, len(_files)))
Example #9
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug('    file already has an id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug('    file found in %s' % src_fs)

            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.move.copy_file(src_fs=src_fs, src_path=file_path, dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug('    file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug('    file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug('    file new path: %s', f_new_path)

        log.debug('    copy file to target storage')

        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.move.copy_file(src_fs=local_fs, src_path=f_old_path, dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }

        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update(
            {'_id': f['gear_id']},
            {'$set': update_set}
        )
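A hypothetical driver loop for the two migrate_gear_files variants above: collect the gear documents that still reference a locally stored rootfs (the same 'git-commit' == 'local' filter used in placeholders_for_gears) and feed each one to migrate_gear_files.

for gear in db['gears'].find({'exchange.git-commit': 'local'}):
    migrate_gear_files({'gear_id': gear['_id'], 'exchange': gear['exchange']})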
Example #10
def gears_to_migrate(api_db, as_admin, randstr, file_form):
    def gen_gear_meta(gear_name):
        return {
            'gear': {
                "version": '0.0.1',
                "config": {},
                "name": gear_name,
                "inputs": {
                    "file": {
                        "base": "file",
                        "description": "Any image."
                    }
                },
                "maintainer": "Test",
                "description": "Test",
                "license": "Other",
                "author": "Test",
                "url": "http://example.example",
                "label": "Test Gear",
                "flywheel": "0",
                "source": "http://example.example"
            }
        }

    gears = []

    gear_name_1 = randstr()

    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()
    r = as_admin.post('/gears/temp',
                      files=file_form((file_name, file_content),
                                      meta=gen_gear_meta(gear_name_1)))
    gear_id_1 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_1)
    gear_json_1 = r.json()

    file_hash_1 = 'v0-' + gear_json_1['exchange']['rootfs-hash'].replace(
        ':', '-')
    file_id_1 = gear_json_1['exchange']['rootfs-id']

    file_path = unicode(util.path_from_hash(file_hash_1))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id_1, config.local_fs, file_path)

    api_db['gears'].find_one_and_update({'_id': ObjectId(gear_id_1)},
                                        {'$unset': {
                                            'exchange.rootfs-id': ''
                                        }})

    gears.append((gear_id_1, file_path))

    gear_name_2 = randstr()
    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()
    r = as_admin.post('/gears/temp',
                      files=file_form((file_name, file_content),
                                      meta=gen_gear_meta(gear_name_2)))
    gear_id_2 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_2)
    gear_json_2 = r.json()

    file_id_2 = gear_json_2['exchange']['rootfs-id']

    file_path = unicode(util.path_from_uuid(file_id_2))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id_2, config.local_fs, file_path)
    gears.append((gear_id_2, file_path))

    yield gears

    # clean up
    gear_json_1 = api_db['gears'].find_one({'_id': ObjectId(gear_id_1)})
    gear_json_2 = api_db['gears'].find_one({'_id': ObjectId(gear_id_2)})
    files_to_delete = []
    files_to_delete.append(
        util.path_from_uuid(gear_json_1['exchange'].get('rootfs-id', '')))
    files_to_delete.append(
        util.path_from_uuid(gear_json_1['exchange'].get('rootfs-hash', '')))
    files_to_delete.append(
        util.path_from_uuid(gear_json_2['exchange'].get('rootfs-id', '')))

    for f_path in files_to_delete:
        try:
            config.primary_storage.remove_file(None, f_path)
        except:
            pass

    api_db['gears'].delete_one({'_id': ObjectId(gear_id_1)})
    api_db['gears'].delete_one({'_id': ObjectId(gear_id_2)})
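A hypothetical consumer of the fixture above: each yielded pair is (gear_id, file_path), with file_path pointing at the gear's rootfs file on the legacy storage before migration, so a test can verify the staging step.

def test_gear_files_staged_on_legacy_storage(gears_to_migrate):
    for gear_id, file_path in gears_to_migrate:
        assert config.local_fs.get_fs().exists(file_path)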
Example #11
def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form):
    # Create a project
    session_id = data_builder.create_session()

    files = []

    # Create a CAS file
    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_1})['files'][0]
    file_id_1 = file_info['_id']
    file_hash_1 = file_info['hash']
    url_1 = '/sessions/' + session_id + '/files/' + file_name_1

    api_db['sessions'].find_one_and_update({'files.name': file_name_1},
                                           {'$unset': {
                                               'files.$._id': ''
                                           }})

    move_file_to_legacy(file_id_1, util.path_from_hash(file_hash_1))
    files.append(
        (session_id, file_name_1, url_1, util.path_from_hash(file_hash_1)))

    # Create an UUID file
    file_name_2 = '%s.csv' % randstr()
    file_content_2 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_2, file_content_2)))

    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_2})['files'][1]
    file_id_2 = file_info['_id']
    url_2 = '/sessions/' + session_id + '/files/' + file_name_2

    move_file_to_legacy(file_id_2, util.path_from_uuid(file_id_2))
    files.append(
        (session_id, file_name_2, url_2, util.path_from_uuid(file_id_2)))

    ### Temp fix for 3-way split storages, see api.config.local_fs2 for details
    # Create an UUID file in legacy/v1 for testing 3-way split storage
    file_name_3 = '%s.csv' % randstr()
    file_content_3 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_3, file_content_3)))
    file_info = api_db['sessions'].find_one({'files.name':
                                             file_name_3})['files'][2]
    file_id_3 = file_info['_id']
    url_3 = '/sessions/' + session_id + '/files/' + file_name_3

    move_file_to_legacy2(file_id_3, util.path_from_uuid(file_id_3))
    files.append(
        (session_id, file_name_3, url_3, util.path_from_uuid(file_id_3)))
    ###

    yield files

    # Clean up, get the files
    files = api_db['sessions'].find_one({'_id': ObjectId(session_id)})['files']
    # Delete the files
    for f in files:
        try:
            config.primary_storage.remove_file(f['_id'],
                                               util.path_from_uuid(f['_id']))
        except:
            pass
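A hypothetical consumer of the fixture above: each yielded tuple is (session_id, file_name, url, storage_path), so a migration test can fetch every file through the API once the data has been moved.

def test_migrated_files_are_downloadable(files_to_migrate, as_admin):
    for session_id, file_name, url, storage_path in files_to_migrate:
        r = as_admin.get(url)
        assert r.ok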