def get_file_hash(self, uuid, path_hint):
    """Compute the formatted hash of a stored file.

    The file is located either by an explicit path hint or by a path
    derived from its uuid, read in buffered chunks from the backing
    filesystem, and hashed with the storage's default algorithm.
    """
    algorithm = self._default_hash_alg
    digest = hashlib.new(algorithm)

    # Prefer the caller-supplied path; fall back to the uuid-derived one.
    target = path_hint if path_hint else path_from_uuid(uuid)
    if not isinstance(target, unicode):
        target = six.u(target)

    with self._fs.open(target, 'rb') as handle:
        # Sentinel-iter form of the chunked read loop; stops on empty read.
        for chunk in iter(lambda: handle.read(self._buffer_size), b''):
            digest.update(chunk)

    return format_hash(algorithm, digest.hexdigest())
def hash(self):
    """Return the formatted hash of the file (algorithm-prefixed digest)."""
    hexdigest = self.hasher.hexdigest()
    return storage.format_hash(self.hash_alg, hexdigest)
def test_py_fs_storage():
    """Exercise the osfs-backed storage: writes, reads, overwrites, nested
    directories, file info, and hashing of uploaded files."""
    pyfs = storage.create_flywheel_fs(type_='osfs', config={'path': '/tmp'})
    assert pyfs._fs is not None
    assert pyfs.is_signed_url() == False

    def write_text(path, text):
        # Open for write, confirm a handle was returned, write and close.
        handle = pyfs.open(None, path, 'w')
        assert handle is not None
        handle.write(text)
        handle.close()

    def read_text(path):
        # Read the whole file back and close the handle.
        handle = pyfs.open(None, path, 'r')
        contents = handle.read()
        handle.close()
        return contents

    # (path, contents): covers create, overwrite, new dir, existing dir,
    # nested dir, and deeply nested dir cases in order.
    round_trips = [
        (u'test.txt', u'This is a test'),
        (u'test.txt', u'Overwrite an existing file'),
        (u'newdir/test.txt', u'Test in a new directory'),
        (u'newdir/test2.txt', u'Test in an existing directory'),
        (u'newdir/nested/test.txt', u'Test in a new nested directory'),
        (u'new_nested/nested/test.txt', u'Test in a new deeply nested directory'),
    ]
    for path, text in round_trips:
        write_text(path, text)
        assert read_text(path) == text

    # Test filesize
    data = pyfs.get_file_info(None, u'test.txt')
    assert 'filesize' in data

    # Test hashing of uploaded files.
    hash_alg = pyfs._default_hash_alg
    hasher = hashlib.new(hash_alg)
    hasher.update(u'Test in a new deeply nested directory')
    hash_val = storage.format_hash(hash_alg, hasher.hexdigest())
    assert hash_val == pyfs.get_file_hash(None, u'new_nested/nested/test.txt')
def process_upload(request, strategy, access_logger, container_type=None, id_=None, origin=None,
                   context=None, response=None, metadata=None, file_fields=None, tempdir=None):
    """
    Universal file upload entrypoint.

    Format:
        Multipart form upload with N file fields, each with their desired filename.
        For technical reasons, no form field names can be repeated. Instead, use
        (file1, file2) and so forth.

        Depending on the type of upload, a non-file form field called "metadata"
        may/must also be sent. If present, it is expected to be a JSON string
        matching the schema for the upload strategy.

        Currently, the JSON returned may vary by strategy.

        Some examples:
        curl -F [email protected] -F [email protected] url
        curl -F metadata=<stuff.json -F [email protected] url
        http --form POST url [email protected] [email protected]

    Features:
                                                 | targeted | reaper | engine | packfile
        Must specify a target container          |    X     |        |   X    |
        May create hierarchy on demand           |          |   X    |        |    X
        May send metadata about the files        |    X     |   X    |   X    |    X
        MUST send metadata about the files       |          |   X    |        |    X
        Creates a packfile from uploaded files   |          |        |        |    X
    """
    log = request.logger

    # Validate the strategy/container arguments before doing any file work.
    if not isinstance(strategy, Strategy):
        raise Exception('Unknown upload strategy')

    if id_ is not None and container_type == None:
        raise Exception('Unspecified container type')

    allowed_container_types = ('project', 'subject', 'session', 'acquisition', 'gear', 'analysis', 'collection')
    if container_type is not None and container_type not in allowed_container_types:
        raise Exception('Unknown container type')

    # Single timestamp stamped onto every file in this upload (field.modified below).
    timestamp = datetime.datetime.utcnow()

    container = None
    if container_type and id_:
        container = hierarchy.get_container(container_type, id_)

    # Check if filename should be basename or full path
    filename_path = request.GET.get('filename_path', '').lower() in ('1', 'true')
    if filename_path:
        name_fn = util.sanitize_path
    else:
        name_fn = os.path.basename

    # The vast majority of this function's wall-clock time is spent here.
    file_processor = files.FileProcessor(config.primary_storage)

    if not file_fields:
        # The only time we need the tempdir_name is when we use token and packfile.
        form = file_processor.process_form(request, use_filepath=filename_path, tempdir_name=tempdir)
        # Non-file form fields may have an empty string as filename, check for 'falsy' values
        file_fields = extract_file_fields(form)

        if 'metadata' in form:
            try:
                metadata = json.loads(form['metadata'].value)
            except Exception:
                raise FileFormException('wrong format for field "metadata"')
            # Normalize the file names inside the metadata with the same
            # function used for the uploaded filenames (basename vs full path).
            if isinstance(metadata, dict):
                for f in metadata.get(container_type, {}).get('files', []):
                    f['name'] = name_fn(f['name'])
            elif isinstance(metadata, list):
                for f in metadata:
                    f['name'] = name_fn(f['name'])

    # strategy.value holds the placer class for this upload type.
    placer_class = strategy.value
    placer = placer_class(container_type, container, id_, metadata, timestamp, origin, context,
                          access_logger, logger=log)
    placer.check()

    # Browsers, when sending a multipart upload, will send files with field name "file" (if sinuglar)
    # or "file1", "file2", etc (if multiple). Following this convention is probably a good idea.
    # Here, we accept any
    # TODO: Change schemas to enabled targeted uploads of more than one file.
    # Ref docs from placer.TargetedPlacer for details.
    if strategy == Strategy.targeted and len(file_fields) > 1:
        raise FileFormException("Targeted uploads can only send one file")

    for field in file_fields:
        if hasattr(field, 'file'):
            # Close the upload handle and record the hash accumulated while streaming.
            field.file.close()
            field.hash = storage.format_hash(files.DEFAULT_HASH_ALG, field.hasher.hexdigest())
        if not hasattr(field, 'hash'):
            field.hash = ''

        # Augment the cgi.FieldStorage with a variety of custom fields.
        # Not the best practice. Open to improvements.
        # These are presumbed to be required by every function later called with field as a parameter.

        # We can trust the filepath on upload is accurate after form processing
        if hasattr(field, 'filepath'):
            # Some placers need this value. Consistent object would be nice
            field.path = field.filepath
            if tempdir:
                # Temp (token/packfile) uploads live on local storage.
                field.size = (config.local_fs.get_file_info(
                    None, field.filepath))['filesize']
            else:
                field.size = (config.primary_storage.get_file_info(
                    field.uuid, util.path_from_uuid(field.uuid)))['filesize']
            # NOTE(review): field.size appears to be set only when the field has a
            # filepath — confirm downstream consumers tolerate a missing size.

        field.mimetype = util.guess_mimetype(
            field.filename)  # TODO: does not honor metadata's mime type if any
        field.modified = timestamp

        # create a file-attribute map commonly used elsewhere in the codebase.
        # Stands in for a dedicated object... for now.
        file_attrs = make_file_attrs(field, origin)

        placer.process_file_field(file_attrs)

    # Respond either with Server-Sent Events or a standard json map
    if placer.sse and not response:
        raise Exception("Programmer error: response required")
    elif placer.sse:
        # Returning a callable will bypass webapp2 processing and allow
        # full control over the response.
        def sse_handler(environ, start_response):  # pylint: disable=unused-argument
            write = start_response(
                '200 OK',
                [('Content-Type', 'text/event-stream; charset=utf-8'),
                 ('Connection', 'keep-alive')])

            # Instead of handing the iterator off to response.app_iter, send it ourselves.
            # This prevents disconnections from leaving the API in a partially-complete state.
            #
            # Timing out between events or throwing an exception will result in undefinied behaviour.
            # Right now, in our environment:
            # - Timeouts may result in nginx-created 500 Bad Gateway HTML being added to the response.
            # - Exceptions add some error json to the response, which is not SSE-sanitized.

            for item in placer.finalize():
                try:
                    write(item)
                except Exception:  # pylint: disable=broad-except
                    # Deliberately best-effort: a failed progress event must not
                    # abort the finalize loop.
                    log.info('SSE upload progress failed to send; continuing')

            return ''

        return sse_handler
    else:
        return placer.finalize()