def placeholders_for_collections():
    log.info('Create placeholders for collections')
    COLLECTIONS_PREFIXES = [('projects', 'files'),
                            ('acquisitions', 'files'),
                            ('analyses', 'files'),
                            ('sessions', 'files'),
                            ('sessions', 'subject.files'),
                            ('collections', 'files')]
    _files = []

    for collection, prefix in COLLECTIONS_PREFIXES:
        cursor = config.db.get_collection(collection).find({})
        for document in cursor:
            for f in get_files_by_prefix(document, prefix):
                f_dict = {
                    'collection_id': document.get('_id'),
                    'collection': collection,
                    'fileinfo': f,
                    'prefix': prefix
                }
                _files.append(f_dict)

    base = config.get_item('persistent', 'data_path')
    for i, f in enumerate(_files):
        f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash']))
        create_placeholder_file(f_path, f['fileinfo']['size'])

        # Show progress
        if i % (len(_files) / 10 + 1) == 0:
            log.info('Processed %s files of total %s files ...' % (i, len(_files)))
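# A hedged sketch of the create_placeholder_file helper used above (and by
# placeholders_for_gears further down). It is assumed to write its second
# argument into the target path as throwaway content, creating parent
# directories as needed; the real helper may differ.
def create_placeholder_file(f_path, content):
    target_dir = os.path.dirname(f_path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    with open(f_path, 'w') as placeholder:
        placeholder.write(str(content))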
def legacy_cas_file(as_admin, api_db, data_builder, randstr, file_form):
    """Yield legacy CAS file"""
    project = data_builder.create_project()
    file_name = '%s.csv' % randstr()
    file_content = randstr()
    as_admin.post('/projects/' + project + '/files', files=file_form((file_name, file_content)))

    file_info = api_db['projects'].find_one({'files.name': file_name})['files'][0]
    file_id = file_info['_id']
    file_hash = file_info['hash']

    # verify cas backward compatibility
    api_db['projects'].find_one_and_update(
        {'files.name': file_name},
        {'$unset': {'files.$._id': ''}}
    )

    file_path = unicode(util.path_from_hash(file_hash))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.exists(target_dir):
        config.local_fs.makedirs(target_dir)
    fs.move.move_file(src_fs=config.fs, src_path=util.path_from_uuid(file_id),
                      dst_fs=config.local_fs, dst_path=file_path)

    yield (project, file_name, file_content)

    # clean up
    config.local_fs.remove(file_path)
    config.local_fs.removetree(target_dir)
    api_db['projects'].delete_one({'_id': project})
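# A hedged example of how the legacy_cas_file fixture above might be exercised:
# download the CAS-addressed file through the API and verify its content survives
# the hash-based (pre-UUID) layout. The route mirrors the upload call in the fixture.
def test_download_legacy_cas_file(legacy_cas_file, as_admin):
    project, file_name, file_content = legacy_cas_file
    r = as_admin.get('/projects/' + project + '/files/' + file_name)
    assert r.ok
    assert r.content == file_content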
def data(args):
    log.info('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
            if not os.path.islink(filepath) and filepath.endswith('.zip'):
                files.append(filepath)
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')]  # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    log.info('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
    for i, filepath in enumerate(files):
        log.info('Loading %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i + 1, file_cnt))
        hash_ = hashlib.sha384()
        size = os.path.getsize(filepath)
        try:
            metadata = json.loads(zipfile.ZipFile(filepath).comment)
        except ValueError as e:
            log.warning(str(e))
            continue
        container = reaperutil.create_container_hierarchy(metadata)
        with open(filepath, 'rb') as fd:
            for chunk in iter(lambda: fd.read(2**20), ''):
                hash_.update(chunk)
        computed_hash = 'v0-sha384-' + hash_.hexdigest()
        destpath = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash(computed_hash))
        dir_destpath = os.path.dirname(destpath)
        filename = os.path.basename(filepath)
        if not os.path.exists(dir_destpath):
            os.makedirs(dir_destpath)
        if args.copy:
            shutil.copyfile(filepath, destpath)
        else:
            shutil.move(filepath, destpath)
        created = modified = datetime.datetime.utcnow()
        fileinfo = {
            'name': filename,
            'size': size,
            'hash': computed_hash,
            'type': 'dicom',  # we are only bootstrapping dicoms at the moment
            'created': created,
            'modified': modified
        }
        container.add_file(fileinfo)
        rules.create_jobs(config.db, container.acquisition, 'acquisition', fileinfo)
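# A hedged sketch of how util.path_from_hash is assumed to shard CAS paths: a hash
# string 'v0-sha384-<hex>' becomes 'v0/sha384/<first two>/<next two>/v0-sha384-<hex>',
# so files spread across directories instead of piling into one. util.path_from_uuid
# is assumed to shard analogously on the UUID's leading characters. The real helpers
# may differ in detail.
def path_from_hash(hash_):
    hash_version, hash_alg, actual_hash = hash_.split('-')
    first_stanza = actual_hash[0:2]
    second_stanza = actual_hash[2:4]
    return os.path.join(hash_version, hash_alg, first_stanza, second_stanza, hash_)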
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s' % src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug(' generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')

        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }
        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {'_id': f['container_id'],
             f['prefix'] + '.name': f['fileinfo']['name'],
             f['prefix'] + '.hash': f['fileinfo']['hash']},
            {'$set': update_set}
        )

        if not updated_doc:
            log.info('The following file was probably updated during the migration '
                     'and its hash changed; cleaning it up from the new filesystem')
            target_fs.remove_file(file_id, f_new_path)
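# A minimal sketch of the buffer_copy helper assumed above (also used by
# migrate_gear_files): stream the source file into the destination in fixed-size
# chunks so large files never have to fit in memory. CHUNK_SIZE is assumed to be
# a module-level constant.
def buffer_copy(src_file, dst_file, chunk_size):
    while True:
        chunk = src_file.read(chunk_size)
        if not chunk:
            break
        dst_file.write(chunk)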
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s' % src_fs)
            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.copy.copy_file(src_fs=src_fs, src_path=file_path,
                              dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug(' generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')
        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.copy.copy_file(src_fs=local_fs, src_path=f_old_path,
                          dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }
        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {'_id': f['container_id'],
             f['prefix'] + '.name': f['fileinfo']['name'],
             f['prefix'] + '.hash': f['fileinfo']['hash']},
            {'$set': update_set}
        )

        if not updated_doc:
            log.info('The following file was probably updated during the migration '
                     'and its hash changed; cleaning it up from the new filesystem')
            target_fs.remove(f_new_path)
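# A hedged sketch of get_src_fs_by_file_path for the pyfilesystem-based variant
# above: probe the known source filesystems and return the first one that holds
# the path. legacy_fs is a hypothetical second source; the real candidate list
# depends on the deployment.
def get_src_fs_by_file_path(file_path):
    for candidate_fs in (local_fs, legacy_fs):
        if candidate_fs.isfile(file_path):
            return candidate_fs
    raise OSError('%s not found on any known source filesystem' % file_path)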
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s' % src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')

        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }
        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update({'_id': f['gear_id']}, {'$set': update_set})
def placeholders_for_gears():
    log.info('Create placeholders for gears')
    cursor = config.db.get_collection('gears').find({})
    _files = []
    for document in cursor:
        if document['exchange']['git-commit'] == 'local':
            f_dict = {
                'gear_id': document['_id'],
                'gear_name': document['gear']['name'],
                'exchange': document['exchange']
            }
            _files.append(f_dict)

    base = config.get_item('persistent', 'data_path')
    for i, f in enumerate(_files):
        f_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_path = os.path.join(base, util.path_from_hash(f_hash))
        create_placeholder_file(f_path, f['gear_name'])

        # Show progress
        if i % (len(_files) / 10 + 1) == 0:
            log.info('Processed %s gear files of total %s files ...' % (i, len(_files)))
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')
    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s' % src_fs)
            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.copy.copy_file(src_fs=src_fs, src_path=file_path,
                              dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')
        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.copy.copy_file(src_fs=local_fs, src_path=f_old_path,
                          dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }
        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update(
            {'_id': f['gear_id']},
            {'$set': update_set}
        )
def gears_to_migrate(api_db, as_admin, randstr, file_form):
    def gen_gear_meta(gear_name):
        return {'gear': {
            'version': '0.0.1',
            'config': {},
            'name': gear_name,
            'inputs': {
                'file': {
                    'base': 'file',
                    'description': 'Any image.'
                }
            },
            'maintainer': 'Test',
            'description': 'Test',
            'license': 'Other',
            'author': 'Test',
            'url': 'http://example.example',
            'label': 'Test Gear',
            'flywheel': '0',
            'source': 'http://example.example'
        }}

    gears = []

    gear_name_1 = randstr()
    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()
    r = as_admin.post('/gears/temp', files=file_form((file_name, file_content), meta=gen_gear_meta(gear_name_1)))
    gear_id_1 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_1)
    gear_json_1 = r.json()

    file_hash_1 = 'v0-' + gear_json_1['exchange']['rootfs-hash'].replace(':', '-')
    file_id_1 = gear_json_1['exchange']['rootfs-id']
    file_path = unicode(util.path_from_hash(file_hash_1))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id_1, config.local_fs, file_path)

    api_db['gears'].find_one_and_update(
        {'_id': ObjectId(gear_id_1)},
        {'$unset': {'exchange.rootfs-id': ''}}
    )

    gears.append((gear_id_1, file_path))

    gear_name_2 = randstr()
    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()
    r = as_admin.post('/gears/temp', files=file_form((file_name, file_content), meta=gen_gear_meta(gear_name_2)))
    gear_id_2 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_2)
    gear_json_2 = r.json()

    file_id_2 = gear_json_2['exchange']['rootfs-id']
    file_path = unicode(util.path_from_uuid(file_id_2))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id_2, config.local_fs, file_path)

    gears.append((gear_id_2, file_path))

    yield gears

    # clean up
    gear_json_1 = api_db['gears'].find_one({'_id': ObjectId(gear_id_1)})
    gear_json_2 = api_db['gears'].find_one({'_id': ObjectId(gear_id_2)})

    files_to_delete = []
    files_to_delete.append(util.path_from_uuid(gear_json_1['exchange'].get('rootfs-id', '')))
    files_to_delete.append(util.path_from_hash('v0-' + gear_json_1['exchange'].get('rootfs-hash', '').replace(':', '-')))
    files_to_delete.append(util.path_from_uuid(gear_json_2['exchange'].get('rootfs-id', '')))

    for f_path in files_to_delete:
        try:
            config.primary_storage.remove_file(None, f_path)
        except:
            pass

    api_db['gears'].delete_one({'_id': ObjectId(gear_id_1)})
    api_db['gears'].delete_one({'_id': ObjectId(gear_id_2)})
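# A hedged sketch of the move_file test helper used above: pull the uploaded blob
# out of primary storage and re-create it at dst_path on the destination storage,
# simulating data written before the UUID layout existed. The open/remove_file
# signatures mirror the storage wrappers used elsewhere in this section but are
# assumptions here.
def move_file(file_id, dst_storage, dst_path):
    src_path = util.path_from_uuid(file_id)
    src_file = config.primary_storage.open(file_id, src_path, 'rb')
    dst_file = dst_storage.open(None, dst_path, 'wb')
    dst_file.write(src_file.read())
    src_file.close()
    dst_file.close()
    config.primary_storage.remove_file(file_id, src_path)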
def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form):
    # Create a session
    session_id = data_builder.create_session()

    files = []

    # Create a CAS file
    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files', files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]
    file_id_1 = file_info['_id']
    file_hash_1 = file_info['hash']
    url_1 = '/sessions/' + session_id + '/files/' + file_name_1

    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$unset': {'files.$._id': ''}}
    )

    move_file_to_legacy(file_id_1, util.path_from_hash(file_hash_1))
    files.append((session_id, file_name_1, url_1, util.path_from_hash(file_hash_1)))

    # Create a UUID file
    file_name_2 = '%s.csv' % randstr()
    file_content_2 = randstr()
    as_admin.post('/sessions/' + session_id + '/files', files=file_form((file_name_2, file_content_2)))

    file_info = api_db['sessions'].find_one({'files.name': file_name_2})['files'][1]
    file_id_2 = file_info['_id']
    url_2 = '/sessions/' + session_id + '/files/' + file_name_2

    move_file_to_legacy(file_id_2, util.path_from_uuid(file_id_2))
    files.append((session_id, file_name_2, url_2, util.path_from_uuid(file_id_2)))

    ### Temp fix for 3-way split storages, see api.config.local_fs2 for details
    # Create a UUID file in legacy/v1 for testing 3-way split storage
    file_name_3 = '%s.csv' % randstr()
    file_content_3 = randstr()
    as_admin.post('/sessions/' + session_id + '/files', files=file_form((file_name_3, file_content_3)))

    file_info = api_db['sessions'].find_one({'files.name': file_name_3})['files'][2]
    file_id_3 = file_info['_id']
    url_3 = '/sessions/' + session_id + '/files/' + file_name_3

    move_file_to_legacy2(file_id_3, util.path_from_uuid(file_id_3))
    files.append((session_id, file_name_3, url_3, util.path_from_uuid(file_id_3)))
    ###

    yield files

    # Clean up, get the files
    files = api_db['sessions'].find_one({'_id': ObjectId(session_id)})['files']

    # Delete the files
    for f in files:
        try:
            config.primary_storage.remove_file(f['_id'], util.path_from_uuid(f['_id']))
        except:
            pass
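# A hedged sketch of the move_file_to_legacy helper assumed by the fixture above:
# relocate a freshly uploaded file from primary storage into the legacy filesystem
# at dst_path, creating parent directories first. move_file_to_legacy2 would do the
# same against the secondary legacy storage (see api.config.local_fs2); both rely
# on the move_file helper sketched earlier.
def move_file_to_legacy(file_id, dst_path):
    target_dir = fs.path.dirname(dst_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)
    move_file(file_id, config.local_fs, dst_path)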