def put(self):
    """Receive a sortable reaper or user upload."""
    #if not self.uid and not self.drone_request:
    #    self.abort(402, 'uploads must be from an authorized user or drone')
    if 'Content-MD5' not in self.request.headers:
        self.abort(400, 'Request must contain a valid "Content-MD5" header.')
    filename = self.request.headers.get('Content-Disposition', '').partition('filename=')[2].strip('"')
    if not filename:
        self.abort(400, 'Request must contain a valid "Content-Disposition" header.')
    with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
        filepath = os.path.join(tempdir_path, filename)
        success, digest, filesize, duration = util.receive_stream_and_validate(self.request.body_file, filepath, self.request.headers['Content-MD5'])
        if not success:
            self.abort(400, 'Content-MD5 mismatch.')
        if not tarfile.is_tarfile(filepath):
            self.abort(415, 'Only tar files are accepted.')
        log.info('Received %s [%s] from %s' % (filename, util.hrsize(self.request.content_length), self.request.user_agent))
        datainfo = util.parse_file(filepath, digest)
        if datainfo is None:
            util.quarantine_file(filepath, self.app.config['quarantine_path'])
            self.abort(202, 'Quarantining %s (unparsable)' % filename)
        util.commit_file(self.app.db.acquisitions, None, datainfo, filepath, self.app.config['data_path'])
        util.create_job(self.app.db.acquisitions, datainfo)  # FIXME we should only mark files as new and let engine take it from there
        throughput = filesize / duration.total_seconds()
        log.info('Received %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
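
# Hedged example: a minimal client-side sketch of the single-file PUT handled above.
# The endpoint URL is an assumption (the route is not defined in this module), and
# whether Content-MD5 is hex or base64 is decided by util.receive_stream_and_validate;
# base64 of the raw digest (the RFC 1864 convention) is shown here.
import base64
import hashlib
import os

import requests

def put_tarball(path, url='http://localhost:8080/upload'):  # hypothetical URL
    with open(path, 'rb') as fd:
        data = fd.read()
    headers = {
        'Content-MD5': base64.b64encode(hashlib.md5(data).digest()),
        'Content-Disposition': 'attachment; filename="%s"' % os.path.basename(path),
    }
    return requests.put(url, data=data, headers=headers)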
def sort(args):
    logging.basicConfig(level=logging.WARNING)
    quarantine_path = os.path.join(args.sort_path, 'quarantine')
    if not os.path.exists(args.sort_path):
        os.makedirs(args.sort_path)
    if not os.path.exists(quarantine_path):
        os.makedirs(quarantine_path)
    print 'initializing DB'
    kwargs = dict(tz_aware=True)
    db_client = connect_db(args.db_uri, **kwargs)
    db = db_client.get_default_database()
    print 'inspecting %s' % args.path
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
            if not os.path.islink(filepath):
                files.append(filepath)
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')]  # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    print 'found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt
    for i, filepath in enumerate(files):
        print 'sorting %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i+1, file_cnt)
        hash_ = hashlib.sha1()
        if not args.quick:
            with open(filepath, 'rb') as fd:
                for chunk in iter(lambda: fd.read(2**20), ''):
                    hash_.update(chunk)
        datainfo = util.parse_file(filepath, hash_.hexdigest())
        if datainfo is None:
            util.quarantine_file(filepath, quarantine_path)
            print 'Quarantining %s (unparsable)' % os.path.basename(filepath)
        else:
            util.commit_file(db.acquisitions, None, datainfo, filepath, args.sort_path)
            util.create_job(db.acquisitions, datainfo)  # FIXME we should only mark files as new and let engine take it from there
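
# Hedged sketch of the CLI wiring sort() expects; the real argparse definition lives
# elsewhere in the repo. The flag names mirror the attributes read above (args.path,
# args.sort_path, args.db_uri, args.quick); the db_uri default is an assumption and
# must name a default database for get_default_database() to work.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='sort a directory tree of files into the data hierarchy')
    parser.add_argument('path', help='directory tree to inspect for sortable files')
    parser.add_argument('sort_path', help='destination root; quarantine/ is created beneath it')
    parser.add_argument('--db_uri', default='mongodb://localhost/scitran', help='MongoDB URI (assumed default)')
    parser.add_argument('-q', '--quick', action='store_true', help='skip SHA-1 hashing of file contents')
    sort(parser.parse_args())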
def upload(self):
    """
    Receive a multi-file upload.

    3 phases:
        1 - upload metadata, obtain upload ticket
        2 - upload files, one file per request (requests may run in parallel)
        3 - send a 'complete' message
    """
    def store_file(fd, filename, md5, arcpath, arcname):
        with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
            filepath = os.path.join(tempdir_path, filename)
            success, _, _, _ = util.receive_stream_and_validate(fd, filepath, md5)
            if not success:
                self.abort(400, 'Content-MD5 mismatch.')
            with lockfile.LockFile(arcpath):
                with tarfile.open(arcpath, 'a') as archive:
                    archive.add(filepath, os.path.join(arcname, filename))

    if self.public_request:
        self.abort(403, 'must be logged in to upload data')
    filename = self.request.GET.get('filename')
    ticket_id = self.request.GET.get('ticket')
    if not ticket_id:
        if filename != 'METADATA.json':
            self.abort(400, 'first file must be METADATA.json')
        try:
            json_body = self.request.json_body
            jsonschema.validate(json_body, UPLOAD_SCHEMA)
        except (ValueError, jsonschema.ValidationError) as e:
            self.abort(400, str(e))
        filetype = json_body['filetype']
        overwrites = json_body['overwrite']
        query = {'name': overwrites['project_name'], 'group': overwrites['group_name']}
        project = self.app.db.projects.find_one(query)
        if project is None:
            self.abort(404, 'no such project')
        # verify permissions
        if not self.superuser_request:
            user_perm = util.user_perm(project['permissions'], self.uid)
            if not user_perm:
                self.abort(403, self.uid + ' does not have permissions on this project')
            if users.INTEGER_ROLES[user_perm['access']] < users.INTEGER_ROLES['rw']:
                self.abort(403, self.uid + ' does not have at least rw permissions on this project')
        acq_no = overwrites.get('acq_no')
        arcname = overwrites['series_uid'] + ('_' + str(acq_no) if acq_no is not None else '') + '_' + filetype
        ticket = util.upload_ticket(arcname=arcname)  # store arcname for later reference
        self.app.db.uploads.insert_one(ticket)
        arcpath = os.path.join(self.app.config['upload_path'], ticket['_id'] + '.tar')
        store_file(self.request.body_file, filename, self.request.headers['Content-MD5'], arcpath, arcname)
        return {'ticket': ticket['_id']}
    ticket = self.app.db.uploads.find_one({'_id': ticket_id})
    if not ticket:
        self.abort(404, 'no such ticket')
    arcpath = os.path.join(self.app.config['upload_path'], ticket_id + '.tar')
    if self.request.GET.get('complete', '').lower() not in ('1', 'true'):
        if 'Content-MD5' not in self.request.headers:
            self.app.db.uploads.remove({'_id': ticket_id})  # delete ticket
            self.abort(400, 'Request must contain a valid "Content-MD5" header.')
        if not filename:
            self.app.db.uploads.remove({'_id': ticket_id})  # delete ticket
            self.abort(400, 'Request must contain a filename query parameter.')
        self.app.db.uploads.update_one({'_id': ticket_id}, {'$set': {'timestamp': datetime.datetime.utcnow()}})  # refresh ticket
        store_file(self.request.body_file, filename, self.request.headers['Content-MD5'], arcpath, ticket['arcname'])
    else:  # complete -> zip, hash, commit
        filepath = arcpath[:-2] + 'gz'  # foo.tar -> foo.tgz
        with gzip.open(filepath, 'wb', compresslevel=6) as gzfile:
            with open(arcpath, 'rb') as rawfile:
                gzfile.writelines(rawfile)
        os.remove(arcpath)
        sha1 = hashlib.sha1()
        with open(filepath, 'rb') as fd:
            for chunk in iter(lambda: fd.read(2**20), ''):
                sha1.update(chunk)
        datainfo = util.parse_file(filepath, sha1.hexdigest())
        if datainfo is None:
            util.quarantine_file(filepath, self.app.config['quarantine_path'])
            self.abort(202, 'Quarantining %s (unparsable)' % filename)
        util.commit_file(self.app.db.acquisitions, None, datainfo, filepath, self.app.config['data_path'])
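
# Hedged client-side sketch of the three-phase protocol in upload()'s docstring.
# The base URL, digest encoding, and JSON response shape are assumptions; the flow
# itself is taken from the handler: (1) send METADATA.json without a ticket to
# obtain one, (2) send each file with ?ticket=<id>&filename=<name>, (3) close the
# ticket with ?complete=true.
import base64
import hashlib
import json
import os

import requests

BASE = 'http://localhost:8080/upload'  # hypothetical endpoint

def _md5_header(data):
    # base64 per RFC 1864 is assumed; util.receive_stream_and_validate defines the real contract
    return base64.b64encode(hashlib.md5(data).digest())

def multi_upload(metadata, filepaths):
    meta = json.dumps(metadata)  # must validate against UPLOAD_SCHEMA
    r = requests.put(BASE, params={'filename': 'METADATA.json'}, data=meta,
                     headers={'Content-MD5': _md5_header(meta), 'Content-Type': 'application/json'})
    ticket = r.json()['ticket']                                        # phase 1: obtain ticket
    for path in filepaths:                                             # phase 2: one file per request
        with open(path, 'rb') as fd:
            data = fd.read()
        requests.put(BASE, params={'ticket': ticket, 'filename': os.path.basename(path)},
                     data=data, headers={'Content-MD5': _md5_header(data)})
    requests.put(BASE, params={'ticket': ticket, 'complete': 'true'})  # phase 3: gzip, hash, commit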
def _put_file(self, _id, container, filename):
    """Receive a targeted processor or user upload."""
    tags = []
    metadata = {}
    if self.request.content_type == 'multipart/form-data':
        filestream = None
        # use cgi lib to parse multipart data without loading all into memory; use tempfile instead
        # FIXME avoid using tempfile; process incoming stream on the fly
        fs_environ = self.request.environ.copy()
        fs_environ.setdefault('CONTENT_LENGTH', '0')
        fs_environ['QUERY_STRING'] = ''
        form = cgi.FieldStorage(fp=self.request.body_file, environ=fs_environ, keep_blank_values=True)
        for fieldname in form:
            field = form[fieldname]
            if fieldname == 'file':
                filestream = field.file
                filename = field.filename
            elif fieldname == 'tags':
                try:
                    tags = json.loads(field.value)
                except ValueError:
                    self.abort(400, 'non-JSON value in "tags" parameter')
            elif fieldname == 'metadata':
                try:
                    metadata = json.loads(field.value)
                except ValueError:
                    self.abort(400, 'non-JSON value in "metadata" parameter')
        if filestream is None:
            self.abort(400, 'multipart/form-data must contain a "file" field')
        elif filename is None:
            self.abort(400, 'Request must contain a filename parameter.')
    else:
        if 'Content-MD5' not in self.request.headers:
            self.abort(400, 'Request must contain a valid "Content-MD5" header.')
        try:
            tags = json.loads(self.request.get('tags', '[]'))
        except ValueError:
            self.abort(400, 'invalid "tags" parameter')
        try:
            metadata = json.loads(self.request.get('metadata', '{}'))
        except ValueError:
            self.abort(400, 'invalid "metadata" parameter')
        filestream = self.request.body_file
    flavor = self.request.GET.get('flavor', 'data')  # TODO: flavor should go away
    if flavor not in ['data', 'attachment']:
        self.abort(400, 'Query must contain flavor parameter: "data" or "attachment".')
    with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
        filepath = os.path.join(tempdir_path, filename)
        md5 = self.request.headers.get('Content-MD5')
        success, digest, _, duration = util.receive_stream_and_validate(filestream, filepath, md5)
        if not success:
            self.abort(400, 'Content-MD5 mismatch.')
        filesize = os.path.getsize(filepath)
        mimetype = util.guess_mimetype(filepath)
        filetype = util.guess_filetype(filepath, mimetype)
        datainfo = {
            'fileinfo': {
                'filename': filename,
                'filesize': filesize,
                'filehash': digest,
                'filetype': filetype,
                'flavor': flavor,
                'mimetype': mimetype,
                'tags': tags,
                'metadata': metadata,
            },
        }
        throughput = filesize / duration.total_seconds()
        log.info('Received %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
        util.commit_file(self.dbc, _id, datainfo, filepath, self.app.config['data_path'])
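
# Hedged example of the multipart request shape _put_file() accepts. The route is
# hypothetical; the 'file'/'tags'/'metadata' field names and the 'flavor' query
# parameter come from the handler above. A non-multipart PUT would instead send the
# raw file body plus a Content-MD5 header, with tags/metadata as request parameters.
import json

import requests

URL = 'http://localhost:8080/acquisitions/12345/file'  # hypothetical route

with open('report.pdf', 'rb') as fd:
    requests.put(URL, params={'flavor': 'attachment'},
                 files={'file': ('report.pdf', fd)},
                 data={'tags': json.dumps(['qa']), 'metadata': json.dumps({'note': 'example'})})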