def ingest(key):
    """HTTP handler: ingest an uploaded multi-file package into the archive.

    Streams each uploaded file into a temporary directory (preserving
    relative paths), ingests the resulting ``Package`` under ``key`` and
    redirects (201) to the new package location.
    """
    _check_base(request)

    # Refuse writes unless the archive is explicitly configured read-write.
    if app.config['archive'].get('mode', 'read-only') != 'read-write':
        return 'Archive is read-only', 500

    with TemporaryDirectory() as tempdir:
        # TODO: this probably breaks with very large packages
        # TODO: implement serialize(...) so this can be done over TAR
        # TODO: validate checksum
        # TODO: cleanup on failure
        for path in request.files:
            temppath = join(tempdir, valid_path(path))
            # Create intermediate directories for nested package paths.
            makedirs(dirname(temppath), exist_ok=True)

            with open(temppath, mode='wb') as o:
                # Stream in 100 KiB chunks to bound memory usage;
                # iter(callable, sentinel) stops at the b'' EOF marker.
                for b in iter(lambda: request.files[path].read(100 * 1024), b''):
                    o.write(b)

        key = archive.ingest(Package(tempdir), key=key)

    # NOTE(review): the original fetched archive.get(key) into an unused
    # local here; dropped as dead code.
    return redirect('/%s/' % key, code=201)
def add(self,
        fname=None,
        path=None,
        data=None,
        url=None,
        traverse=True,
        exclude='^\\..*|^_.*',
        replace=False,
        type='Resource',
        **kwargs):
    """Add a file, raw data or a reference to this package.

    Accepts either ``fname`` (a local file, recursed into when it is a
    directory and ``traverse`` is set), ``path`` + ``data``, or ``path`` +
    ``url`` with ``type='Reference'``.  Raises on read-only packages and on
    the reserved paths ``_package.json`` / ``_log``.
    """
    if self._mode != 'a':
        raise Exception('package not writable, open in \'a\' mode')

    if not (fname or (path and data) or (path and url and type == 'Reference')):
        # fix: error message previously ended with a stray ')'
        raise Exception('Specify either path to a filename or path and data')

    # Default the package path to the basename of the source file.
    path = path or basename(abspath(fname))

    if path == '_package.json' or path == '_log':
        raise Exception(f'path ({path}) not allowed')

    if fname and traverse and isdir(fname):
        # Recursively add a directory tree, honouring the exclude pattern.
        self._add_directory(fname, path, exclude=exclude)
    else:
        self._write(valid_path(path),
                    iname=fname,
                    data=data,
                    url=url,
                    replace=replace,
                    type=type,
                    **kwargs)

    # Refresh the in-memory description after the write.
    self._reload()
def exists(self, key, path=None):
    """Return True if package ``key`` (or ``path`` within it) exists on
    local storage.

    Only ``file://`` locations can be checked against the filesystem;
    any other URL scheme (e.g. an unresolved Reference URL) reports False.
    """
    loc = self.location(valid_key(key),
                        path=valid_path(path) if path else None)

    # fix: previously the 'file://' prefix was stripped blindly (l[7:]),
    # which garbled non-file URLs before the filesystem check.
    if loc and loc.startswith('file://'):
        return exists(loc[len('file://'):])

    return False
def ingest(self, package, key=None, copy=True):
    """Ingest ``package`` into the archive under a (possibly generated) key.

    With ``copy=True`` every file — including References — is materialised
    locally and References are rewritten as Resources.  The package
    directory is removed again if anything fails mid-ingest.  Returns the
    key the package was stored under.
    """

    def _localize(entry):
        # Work on a copy: re-key the entry by its path and, when copying,
        # turn a Reference into a local Resource (dropping its url).
        entry = deepcopy(entry)
        entry['@id'] = entry['path']
        if copy and entry['@type'] == 'Reference':
            entry['@type'] = 'Resource'
            if 'url' in entry:
                del entry['url']
        return entry

    self._check_mode()
    key = self._generate_key(suggest=valid_key(key) if key else None)

    with self.lockm.get(key):
        pkg_dir = self._directory(key)
        makedirs(pkg_dir + sep)

        try:
            for path in package:
                if package.get(path)['@type'] != 'Reference' or copy:
                    self._copy(package.get_raw(path),
                               join(pkg_dir, valid_path(path)))

            desc = deepcopy(package.description())
            desc['@id'] = ''
            desc['files'] = [_localize(entry) for entry in desc['files']]

            with open(join(pkg_dir, '_package.json'), mode='w') as out:
                out.write(dumps(desc, indent=2))

            # TODO copy meta-files too

            p = starch.Package(
                pkg_dir,
                base=self.base + key + '/' if self.base else None)
            p.validate()
        except Exception as e:
            # Roll back the partially-written package directory.
            rmtree(pkg_dir)
            raise e

        self._log_add_package(key)
        self._callback('ingest', key=key, package=p)

    return key
def location(self, key, path=None):
    """Resolve ``key`` (and optional ``path`` within it) to a URL.

    Prefers a local ``file://`` URL when the file or package directory is
    present on disk; otherwise falls back to a Reference entry's URL from
    the package description, rewriting its scheme through the configured
    ``resolutions`` map when applicable.  Returns None when unresolvable.
    """
    base_dir = self._directory(valid_key(key))
    candidate = join(base_dir, valid_path(path)) if path else base_dir

    # Fast path: the file (or the package directory itself) exists locally.
    if exists(candidate):
        return 'file://' + candidate

    # Fall back to the package description.
    package = self.get(key)
    if package and path in package:
        entry = package.get(path)
        if entry['@type'] == 'Reference' and 'url' in entry:
            u = entry['url']
            scheme = u[:u.index(':')]
            if scheme in self.resolutions:
                # Rewrite a resolvable scheme to a local file URL.
                u = 'file://' + self.resolutions[scheme] + u[u.index(':') + 1:]
            return u

    return None
def put_file(key, path): _check_base(request) #print(request.args) type = str(request.args.get('type', 'Resource')) if type != 'Reference' and 'expected_hash' not in request.args: #print(type, type == 'Reference', flush=True) return 'parameter expected_hash missing', 400 try: p = archive.get(key, mode='a') if p: path = valid_path(path) url = request.args.get('url', None) expected_hash = request.args.get('expected_hash', None) replace = request.args.get('replace', 'False') == 'True' args = { k: v for k, v in request.args.items() if k not in ['type', 'path', 'replace', 'url', 'expected_hash'] } if url and type == 'Reference': p.add(path=path, url=url, replace=replace, type=type, **args) return 'done', 204 else: if path in request.files: if expected_hash.startswith('MD5:'): h = md5() elif expected_hash.startswith('SHA256:'): h = sha256() else: return 'unsupported hash function \'%s\'' % expected_hash.split( ':')[0], 400 with NamedTemporaryFile() as tempfile: with open(tempfile.name, 'wb') as o: b = None while b != b'': b = request.files[path].read(100 * 1024) o.write(b) h.update(b) h2 = expected_hash.split( ':')[0] + ':' + h.digest().hex() if h2 != expected_hash: return 'expected hash %s, got %s' % (expected_hash, h2), 400 p = archive.get(key, mode='a') p.add(tempfile.name, path=path, replace=replace, type=type, **args) return 'done', 204 else: return 'path (%s) not found in request' % path, 400 else: return 'package not found', 400 except Exception as e: print_exc() print(flush=True) return str(e), 500
def add(self,
        fname=None,
        path=None,
        data=None,
        url=None,
        type='Resource',
        traverse=True,
        check_version=None,
        exclude='^\\..*|^_.*',
        replace=False,
        **kwargs):
    """Add a file, raw data or a URL reference to this package.

    One of three sources must be given: a local file ``fname`` (directories
    are recursed into when ``traverse`` is set), ``path`` + ``data``
    (dict/list data is JSON-encoded), or ``path`` + ``url``.  Stored content
    is streamed through SHA-256 into a temp file, MIME-sniffed, and moved
    into place; identical existing content is deduplicated.  A ``url`` with
    ``type='Reference'`` is recorded without transfer (probing HTTP URLs
    for size/MIME best-effort).  ``check_version`` guards against
    concurrent modification.  Raises on read-only packages, reserved
    paths, existing files without ``replace``, and file: URLs.
    """
    if self._mode not in ['w', 'a']:
        raise Exception('package not writable, open in \'a\' mode')

    if not (fname or (path and (data is not None or url))):
        raise Exception('Specify either file or path and (data or url)')

    if fname and data:
        raise Exception('Specifying both a file and data is not allowed')

    if url and url.startswith('file:'):
        raise Exception('file-URLs not allowed')

    path = valid_path(path or basename(abspath(fname)))

    if path in self and not replace:
        raise Exception(f'file ({path}) already exists, use replace=True')

    # Skeleton description entry; filled in below per source kind.
    f = {'@id': quote(path), '@type': type, 'path': path}

    with self._lock_ctx():
        # Optimistic-concurrency check against the description version.
        if check_version and check_version != self._desc['version']:
            raise Exception(
                f'File changed, new version is {self._desc["version"]}')

        if path in ['_package.json', '_log']:
            raise Exception(f'filename {path} not allowed')

        # Structured data is stored as JSON.
        if isinstance(data, (dict, list)):
            data = dumps(data).encode('utf-8')

        if fname and isdir(fname):
            if traverse:
                self._add_directory(fname, valid_path(path), exclude=exclude)
            else:
                # fix: message previously lacked the f-prefix, so {fname}
                # was emitted literally instead of the actual filename.
                raise Exception(
                    f'file {fname} is a directory, set traverse=True to add directories'
                )
            return
        elif fname or data or url and type != 'Reference':
            # Content is actually stored (file, raw data, or fetched URL).
            f['urn'] = uuid4().urn
            if url:
                f['url'] = url

            # fix: temppath must exist before the try so the finally
            # clause cannot raise NameError when an earlier check fails
            # (e.g. the replace=False conflict below).
            temppath = None
            try:
                oname = join(self.root_dir, path)

                if not replace and exists(oname):
                    raise Exception(
                        'file (%s) already exists, use replace=True' % path)

                if not exists(dirname(oname)):
                    makedirs(dirname(oname))

                temppath = join(self.root_dir, f'{path}-tmp-{str(random())}')
                h = sha256()

                # Stream source -> temp file in 100 KiB chunks, hashing
                # as we go; b'' marks EOF.
                with open(fname, 'rb') if fname else BytesIO(
                        data) if data else htopen(
                            url, 'rb') as stream, open(temppath, 'wb') as out:
                    data = None
                    while data != b'':
                        data = stream.read(100 * 1024)
                        out.write(data)
                        h.update(data)
                    size = out.tell()
            except:
                raise
            else:
                f['size'] = size
                f['checksum'] = 'SHA256:' + h.hexdigest()

                if path in self and self[path]['checksum'] == f['checksum']:
                    # Identical content already present — keep the
                    # existing entry and discard the temp file.
                    f = self[path]
                else:
                    with Magic(flags=MAGIC_MIME) as m:
                        f['mime_type'] = m.id_filename(temppath).split(
                            ';')[0]
                    move(temppath, oname)
            finally:
                # Clean up the temp file on failure or dedup.
                if temppath and exists(temppath):
                    remove(temppath)
        elif url and type == 'Reference':
            # Reference only: record the URL, probe HTTP for metadata.
            f['url'] = url
            if url.startswith('http'):
                try:
                    with get(url,
                             headers={'Accept-Encoding': 'identity'},
                             stream=True) as r:
                        if 'Content-Length' in r.headers:
                            f['size'] = int(r.headers['Content-Length'])
                        if 'Content-Type' in r.headers:
                            f['mime_type'] = r.headers[
                                'Content-Type'].split(';')[0]
                except:
                    # Best-effort probe; failures are only logged.
                    # NOTE(review): log message has an unbalanced '(' —
                    # left as-is in case log lines are parsed elsewhere.
                    self._log(f'WARN could not probe URL ({url}')
        else:
            raise Exception('Incompatible parameters')

        # Meta-like files are cross-linked from the description.
        if type in ['Meta', 'Structure', 'Content']:
            if 'see_also' not in self._desc:
                self._desc['see_also'] = []
            if path not in self._desc['see_also']:
                self._desc['see_also'] += [path]

        f.update(kwargs)
        self._desc['files'][path] = f

        if type != 'Reference':
            self._log(
                f'STORE "{f["urn"]} {path} size:{f["size"]}{(" " + f["mime_type"]) if "mime_type" in f else ""} {f["checksum"]}'
            )
        else:
            self._log(
                f'REF {url} {path}{(" size:" + str(f["size"])) if "size" in f else ""}{(" " + f["mime_type"]) if "mime_type" in f else ""}'
            )

        self.save()