def lambda_handler(request): """ dynamically handle preview requests for bytes in S3 caller must specify input_type (since there may be no file extension) Returns: JSON response """ url = request.args['url'] input_type = request.args.get('input') compression = request.args.get('compression') parsed_url = urlparse(url, allow_fragments=False) if not (parsed_url.scheme == 'https' and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX) and parsed_url.username is None and parsed_url.password is None): return make_json_response( 400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'}) try: line_count = _str_to_line_count( request.args.get('line_count', str(MAX_LINES))) except ValueError as error: # format https://jsonapi.org/format/1.1/#error-objects return make_json_response(400, { 'title': f'Unexpected line_count= value', 'detail': str(error) }) # stream=True saves memory almost equal to file size resp = requests.get(url, stream=True) if resp.ok: if input_type == 'excel': html, info = extract_excel(_to_memory(resp, compression)) elif input_type == 'ipynb': html, info = extract_ipynb(_to_memory(resp, compression)) elif input_type == 'parquet': html, info = extract_parquet(_to_memory(resp, compression)) elif input_type == 'vcf': html, info = extract_vcf( _from_stream(resp, compression, line_count)) elif input_type == 'txt': html, info = extract_txt( _from_stream(resp, compression, line_count)) else: assert False, f'unexpected input_type: {input_type}' assert isinstance(html, str), 'expected html parameter as string' assert isinstance(info, dict), 'expected info metadata to be a dict' ret_val = { 'info': info, 'html': html, } else: ret_val = {'error': resp.reason} return make_json_response(200, ret_val)
def test_json_response(self):
    """
    Test make_json_response()
    """
    status, body, headers = make_json_response(400, {'foo': 'bar'})
    assert status == 400
    assert json.loads(body) == {'foo': 'bar'}
    assert headers == {'Content-Type': 'application/json'}

    status, body, headers = make_json_response(
        200, {'foo': 'bar'}, {'Content-Length': '123'})
    assert status == 200
    assert json.loads(body) == {'foo': 'bar'}
    assert headers == {'Content-Type': 'application/json', 'Content-Length': '123'}
def lambda_handler(request): """ Generate thumbnails for images in S3 """ url = request.args['url'] size = SIZE_PARAMETER_MAP[request.args['size']] output = request.args.get('output', 'json') resp = requests.get(url) if resp.ok: image_bytes = BytesIO(resp.content) with Image.open(image_bytes) as image: orig_format = image.format orig_size = image.size if orig_format in SUPPORTED_BROWSER_FORMATS: thumnail_format = orig_format else: image = image.convert('RGBA') thumnail_format = 'PNG' image.thumbnail(size) thumbnail_size = image.size thumbnail_bytes = BytesIO() image.save(thumbnail_bytes, thumnail_format) data = thumbnail_bytes.getvalue() info = { 'original_format': orig_format, 'original_size': orig_size, 'thumbnail_format': thumnail_format, 'thumbnail_size': thumbnail_size, } if output == 'json': ret_val = { 'info': info, 'thumbnail': base64.b64encode(data).decode(), } return make_json_response(200, ret_val) else: headers = { 'Content-Type': Image.MIME[thumnail_format], 'X-Quilt-Info': json.dumps(info) } return 200, data, headers else: ret_val = {'error': resp.reason} return make_json_response(resp.status_code, ret_val)
def lambda_handler(request): """ Convert molecule formats """ url = request.args["url"] format_ = request.args["format"] resp = requests.get(url) if not resp.ok: # Errored, return error code ret_val = { "error": resp.reason, "text": resp.text, } return make_json_response(resp.status_code, ret_val) input_bytes = resp.content filename = urlparse(url).path.rpartition("/")[-1] input_base, input_ext = os.path.splitext(filename) if input_ext == ".gz": input_ext = os.path.splitext(input_base)[1] input_bytes = gzip.decompress(input_bytes) input_ext = input_ext[1:] p = subprocess.run( ( OBABEL, f"-i{input_ext}", f"-o{FORMATS[format_]}", ), check=False, input=input_bytes, capture_output=True, ) if p.returncode != 0: return make_json_response(403, {"error": p.stderr.decode()}) data = p.stdout headers = { "Content-Type": format_, "Content-Disposition": f'inline; filename="{input_base}.{FORMATS[format_]}"', } return 200, data, headers
def lambda_handler(request): """ Proxy the request to the elastic search. """ es_host = os.environ['ES_HOST'] region = os.environ['AWS_REGION'] auth = BotoAWSRequestsAuth(aws_host=es_host, aws_region=region, aws_service='es') es_client = Elasticsearch(hosts=[{ 'host': es_host, 'port': 443 }], http_auth=auth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection) index = request.pathParameters['proxy'] body = request.args.get('source') _source = request.args.get('_source') size = request.args.get('size', '1000') result = es_client.search(index, body, _source=_source, size=size, timeout=MAX_QUERY_DURATION) return make_json_response(200, result)
def lambda_handler(request):
    url = request.args["url"]
    input_type = request.args.get("input")
    output_size = request.args.get("size", "small")
    compression = request.args.get("compression")

    if not is_s3_url(url):
        return make_json_response(
            400, {"title": "Invalid url=. Expected S3 virtual-host URL."})

    handler = handlers[input_type]
    return handler(url, compression, OUTPUT_SIZES[output_size])
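# For illustration only: a minimal sketch of how the `handlers` dispatch table and
# `OUTPUT_SIZES` map used by the handler above might be laid out. The real registry
# lives elsewhere in this codebase; the entries and byte budgets below are
# hypothetical placeholders, not the actual implementations.
OUTPUT_SIZES = {
    "small": 100 * 1024,          # assumed byte budget; actual values may differ
    "medium": 1024 * 1024,
    "large": 10 * 1024 * 1024,
}

def preview_csv(url, compression, max_out_size):
    # Hypothetical handler: fetch the object, decompress, and render a CSV preview.
    ...

def preview_parquet(url, compression, max_out_size):
    # Hypothetical handler: fetch the object and render a Parquet preview.
    ...

handlers = {
    "csv": preview_csv,
    "parquet": preview_parquet,
}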
def _push_pkg_to_successor(data, *, get_src, get_dst, get_name, get_pkg,
                           pkg_max_size, pkg_max_files):
    dst_registry = get_registry(get_dst(data))
    src_registry = get_registry(get_src(data))
    copy_data = _get_successor_params(src_registry, dst_registry).get('copy_data', True)

    try:
        pkg = get_pkg(src_registry, data)
        if copy_data:
            total_size = 0
            total_files = 0
            for lk, e in pkg.walk():
                total_size += e.size
                if total_size > pkg_max_size:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Total package size is {total_size}, "
                        f"but max supported size with `copy_data: true` is {pkg_max_size}"
                    )
                total_files += 1
                if total_files > pkg_max_files:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Package has {total_files} files, "
                        f"but max supported number with `copy_data: true` is {pkg_max_files}"
                    )
        meta = data.get('meta')
        if meta is None:
            pkg._meta.pop('user_meta', None)
        else:
            pkg.set_meta(meta)
        return make_json_response(200, {
            'top_hash': pkg._push(
                name=get_name(data),
                registry=get_dst(data),
                message=data.get('message'),
                workflow=data.get('workflow', ...),
                selector_fn=None if copy_data else lambda *args: False,
                print_info=False,
            )._origin.top_hash,
        }, add_status=True)
    except quilt3.util.QuiltException as qe:
        raise ApiException(HTTPStatus.BAD_REQUEST, qe.message)
    except ClientError as boto_error:
        raise ApiException.from_botocore_error(boto_error)
    except quilt3.data_transfer.S3NoValidClientError as e:
        raise ApiException(HTTPStatus.FORBIDDEN, e.message)
def create_package(request):
    json_iterator = map(json.JSONDecoder().decode,
                        (line.decode() for line in request.stream))

    data = next(json_iterator)
    get_schema_validator(PACKAGE_CREATE_SCHEMA)(data)
    handle = data['name']
    registry = data['registry']

    try:
        package_registry = get_registry(registry)

        meta = data.get('meta')
        message = data.get('message')
        quilt3.util.validate_package_name(handle)
        pkg = quilt3.Package()
        if meta is not None:
            pkg.set_meta(meta)

        size_to_hash = 0
        files_to_hash = 0
        for entry in map(get_schema_validator(PACKAGE_CREATE_ENTRY_SCHEMA), json_iterator):
            try:
                physical_key = PhysicalKey.from_url(entry['physical_key'])
            except ValueError:
                raise ApiException(
                    HTTPStatus.BAD_REQUEST,
                    f"{entry['physical_key']} is not a valid s3 URL.")
            if physical_key.is_local():
                raise ApiException(HTTPStatus.BAD_REQUEST,
                                   f"{str(physical_key)} is not in S3.")
            logical_key = entry['logical_key']

            hash_ = entry.get('hash')
            obj_size = entry.get('size')
            meta = entry.get('meta')

            if hash_ and obj_size is not None:
                pkg.set(
                    logical_key,
                    quilt3.packages.PackageEntry(
                        physical_key,
                        obj_size,
                        {'type': 'SHA256', 'value': hash_},
                        meta,
                    ))
            else:
                pkg.set(logical_key, str(physical_key), meta)

                size_to_hash += pkg[logical_key].size
                if size_to_hash > PKG_FROM_FOLDER_MAX_PKG_SIZE:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Total size of new S3 files is {size_to_hash}, "
                        f"but max supported size is {PKG_FROM_FOLDER_MAX_PKG_SIZE}"
                    )

                files_to_hash += 1
                if files_to_hash > PKG_FROM_FOLDER_MAX_FILES:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Package has {files_to_hash} new S3 files, "
                        f"but max supported number is {PKG_FROM_FOLDER_MAX_FILES}"
                    )

        pkg._validate_with_workflow(
            registry=package_registry,
            workflow=data.get('workflow', ...),
            name=handle,
            message=message,
        )
    except quilt3.util.QuiltException as qe:
        raise ApiException(HTTPStatus.BAD_REQUEST, qe.message)

    calculate_pkg_hashes(user_boto_session, pkg)
    try:
        top_hash = pkg._build(
            name=handle,
            registry=registry,
            message=message,
        )
    except ClientError as boto_error:
        raise ApiException.from_botocore_error(boto_error)

    return make_json_response(200, {
        'top_hash': top_hash,
    })
def lambda_handler(request): """ Parse a manifest to return a folder-like view of its contents (logical keys). Returns: JSON response """ bucket = request.args['bucket'] key = request.args['manifest'] prefix = request.args.get('prefix') logical_key = request.args.get('logical_key') access_key = request.args.get('access_key') secret_key = request.args.get('secret_key') session_token = request.args.get('session_token') allow_anonymous_access = bool(os.getenv('ALLOW_ANONYMOUS_ACCESS')) # If credentials are passed in, use them # for the client. If no credentials are supplied, test that # the manifest object is publicly accessible. If so, create # an s3 client using the underlying IAM role's permissions. if access_key and secret_key and session_token: s3_client = create_s3_client(aws_access_key_id=access_key, aws_secret_access_key=secret_key, aws_session_token=session_token) elif (allow_anonymous_access and access_key is None and secret_key is None and session_token is None): # Test to see if the target key is publicly accessible. If not, the call # below will raise and exception and return a 403 response anons3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) try: anons3.head_object(Bucket=bucket, Key=key) except botocore.exceptions.ClientError as error: if error.response.get('Error'): code = error.response['Error']['Code'] if code == '403': return make_json_response( 403, { 'title': 'Access Denied', 'detail': f"Access denied reading manifest: {key}" }) raise error # Use the default S3 client configuration s3_client = boto3.client('s3') else: return make_json_response( 401, { 'title': 'Incomplete credentials', 'detail': "access_key, secret_key and session_token are required" }) assert s3_client # Get details of a single file in the package if logical_key is not None: sql_stmt = f"SELECT s.* FROM s3object s WHERE s.logical_key = '{sql_escape(logical_key)}' LIMIT 1" response_data = json.load( query_manifest_content(s3_client, bucket=bucket, key=key, sql_stmt=sql_stmt)) else: # Call s3 select to fetch only logical keys matching the # desired prefix (folder path) prefix_length = len(prefix) if prefix is not None else 0 sql_stmt = ( f"SELECT SUBSTRING(s.logical_key, {prefix_length + 1}) AS logical_key" ", s.\"size\", s.physical_keys[0] as physical_key FROM s3object s") if prefix: sql_stmt += f" WHERE SUBSTRING(s.logical_key, 1, {prefix_length}) = '{sql_escape(prefix)}'" result = query_manifest_content(s3_client, bucket=bucket, key=key, sql_stmt=sql_stmt) # Parse the response into a logical folder view df = pd.read_json(result, lines=True) response_data = file_list_to_folder(df) # Fetch package-level or directory-level metadata if prefix: sql_stmt = f"SELECT s.meta FROM s3object s WHERE s.logical_key = '{sql_escape(prefix)}'" else: sql_stmt = "SELECT s.* FROM s3object s WHERE s.logical_key is NULL" result = query_manifest_content(s3_client, bucket=bucket, key=key, sql_stmt=sql_stmt) meta = json.load(result) if result else {} response_data.update(dict(meta=meta)) ret_val = make_json_response(200, {'contents': response_data}) return ret_val
def lambda_handler(request): """ Generate thumbnails for images in S3 """ # Parse request info url = request.args['url'] size = SIZE_PARAMETER_MAP[request.args['size']] output = request.args.get('output', 'json') # Handle request resp = requests.get(url) if resp.ok: # Get the original reader / format # If the original reader isn't in the supported formats map, use PNG as default presentation format try: thumbnail_format = SUPPORTED_BROWSER_FORMATS.get(imageio.get_reader(resp.content), "PNG") # In the case imageio can't read the image, default to PNG # Usually an OME-TIFF / CZI / some other bio-format except ValueError: thumbnail_format = "PNG" # Read image data img = AICSImage(resp.content) orig_size = list(img.reader.data.shape) # Generate a formatted ndarray using the image data # Makes some assumptions for n-dim data img = format_aicsimage_to_prepped(img) # Send to Image object for thumbnail generation and saving to bytes img = Image.fromarray(img) # Generate thumbnail img.thumbnail(size) thumbnail_size = img.size # Store the bytes thumbnail_bytes = BytesIO() img.save(thumbnail_bytes, thumbnail_format) # Get bytes data data = thumbnail_bytes.getvalue() # Create metadata object info = { 'original_size': orig_size, 'thumbnail_format': thumbnail_format, 'thumbnail_size': thumbnail_size, } # Store data if output == 'json': ret_val = { 'info': info, 'thumbnail': base64.b64encode(data).decode(), } return make_json_response(200, ret_val) # Not json response headers = { 'Content-Type': Image.MIME[thumbnail_format], 'X-Quilt-Info': json.dumps(info) } return 200, data, headers # Errored, return error code ret_val = { 'error': resp.reason } return make_json_response(resp.status_code, ret_val)
def lambda_handler(request): """ Generate thumbnails for images in S3 """ # Parse request info url = request.args['url'] size = SIZE_PARAMETER_MAP[request.args['size']] input_ = request.args.get('input', 'image') output = request.args.get('output', 'json') page = int(request.args.get('page', '1')) count_pages = request.args.get('countPages') == 'true' # Handle request resp = requests.get(url) if resp.ok: try: thumbnail_format = SUPPORTED_BROWSER_FORMATS.get( imageio.get_reader(resp.content), "PNG") except ValueError: thumbnail_format = "JPEG" if input_ == "pdf" else "PNG" if input_ == "pdf": set_pdf_env() try: kwargs = { # respect width but not necessarily height to preserve aspect ratio "size": (size[0], None), "fmt": "JPEG", } if not count_pages: kwargs["first_page"] = page kwargs["last_page"] = page pages = convert_from_bytes( resp.content, **kwargs, ) num_pages = len(pages) preview = pages[0] if count_pages: # shift 1-index to 0-index preview = pages[page - 1] except (IndexError, PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError, PopplerNotInstalledError) as exc: return make_json_response(500, {'error': str(exc)}) info = { 'thumbnail_format': 'JPEG', 'thumbnail_size': preview.size, } if count_pages: info['page_count'] = num_pages thumbnail_bytes = BytesIO() preview.save(thumbnail_bytes, thumbnail_format) data = thumbnail_bytes.getvalue() else: # Read image data img = AICSImage(resp.content) orig_size = list(img.reader.data.shape) # Generate a formatted ndarray using the image data # Makes some assumptions for n-dim data img = format_aicsimage_to_prepped(img) # Send to Image object for thumbnail generation and saving to bytes img = Image.fromarray(img) # Generate thumbnail img.thumbnail(size) thumbnail_size = img.size # Store the bytes thumbnail_bytes = BytesIO() img.save(thumbnail_bytes, thumbnail_format) # Get bytes data data = thumbnail_bytes.getvalue() # Create metadata object info = { 'original_size': orig_size, 'thumbnail_format': thumbnail_format, 'thumbnail_size': thumbnail_size, } if output == 'json': ret_val = { 'info': info, 'thumbnail': base64.b64encode(data).decode(), } return make_json_response(200, ret_val) # Not JSON response ('raw') headers = { 'Content-Type': Image.MIME[thumbnail_format], QUILT_INFO_HEADER: json.dumps(info) } return 200, data, headers # Errored, return error code ret_val = { 'error': resp.reason, 'text': resp.text, } return make_json_response(resp.status_code, ret_val)
def wrapper(*args, **kwargs):
    # Inner wrapper of an exception-handling decorator: `f` and `exception_types`
    # are closed over from the enclosing scope.
    try:
        return f(*args, **kwargs)
    except exception_types as e:
        return make_json_response(500, {'error': str(e)})
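# For illustration only: a plausible enclosing decorator factory that would produce the
# `wrapper` above. The actual factory name and signature in this codebase may differ;
# `exception_handler` is a hypothetical placeholder, and `make_json_response` is the
# same helper used throughout these handlers.
import functools

def exception_handler(*exception_types):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except exception_types as e:
                return make_json_response(500, {'error': str(e)})
        return wrapper
    return decorator

# Usage sketch:
# @exception_handler(ValueError, KeyError)
# def lambda_handler(request):
#     ...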
def lambda_handler(request): """ Proxy the request to the elastic search. """ action = request.args.get('action') indexes = request.args.get('index') terminate_after = os.getenv('MAX_DOCUMENTS_PER_SHARD') if action == 'search': query = request.args.get('query', '') body = { "query": { "simple_query_string": { "query": query, "fields": ['content', 'comment', 'key_text', 'meta_text'] } } } # TODO: should be user settable; we should proably forbid `content` (can be huge) _source = [ 'key', 'version_id', 'updated', 'last_modified', 'size', 'user_meta' ] size = 1000 elif action == 'stats': body = { "query": { "match_all": {} }, "aggs": { "totalBytes": { "sum": { "field": 'size' } }, "exts": { "terms": { "field": 'ext' }, "aggs": { "size": { "sum": { "field": 'size' } } }, }, } } size = 0 # We still get all aggregates, just don't need the results _source = [] # Consider all documents when computing counts, etc. terminate_after = None elif action == 'images': body = { 'query': { 'terms': { 'ext': IMG_EXTS } }, 'collapse': { 'field': 'key', 'inner_hits': { 'name': 'latest', 'size': 1, 'sort': [{ 'last_modified': 'desc' }], '_source': ['key', 'version_id'], }, }, } size = NUM_PREVIEW_IMAGES _source = [] elif action == 'sample': body = { 'query': { 'bool': { 'must': [{ 'terms': { 'ext': SAMPLE_EXTS } }], 'must_not': [ { 'terms': { 'key': README_KEYS + [SUMMARIZE_KEY] } }, { 'wildcard': { 'key': '*/' + SUMMARIZE_KEY } }, ], }, }, 'collapse': { 'field': 'key', 'inner_hits': { 'name': 'latest', 'size': 1, 'sort': [{ 'last_modified': 'desc' }], '_source': ['key', 'version_id'], }, }, } size = NUM_PREVIEW_FILES _source = [] else: return make_json_response(400, {"title": "Invalid action"}) es_host = os.environ['ES_HOST'] region = os.environ['AWS_REGION'] index_overrides = os.getenv('INDEX_OVERRIDES', '') auth = BotoAWSRequestsAuth(aws_host=es_host, aws_region=region, aws_service='es') es_client = Elasticsearch(hosts=[{ 'host': es_host, 'port': 443 }], http_auth=auth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection) to_search = f"{indexes},{index_overrides}" if index_overrides else indexes result = es_client.search(to_search, body, _source=_source, size=size, terminate_after=terminate_after, timeout=MAX_QUERY_DURATION) return make_json_response(200, post_process(result, action))
def lambda_handler(request): """ Generate previews for videos in S3 """ url = request.args['url'] format = request.args['format'] def _parse_param(name, default_value, min_value, max_value): value_str = request.args.get(name) if value_str is None: return default_value try: value = type(default_value)(value_str) if not min_value <= value <= max_value: raise ValueError return value except ValueError: raise ValueError( f"Invalid {name!r}; must be between {min_value} and {max_value} inclusive" ) try: width = _parse_param('width', 320, 10, 640) height = _parse_param('height', 240, 10, 480) duration = _parse_param('duration', 5.0, 0.1, 30) audio_bitrate = _parse_param('audio_bitrate', 128, 64, 320) file_size = _parse_param('file_size', MAX_FILE_SIZE, 1024, MAX_FILE_SIZE) except ValueError as ex: return make_json_response(400, {'error': str(ex)}) category = format.split('/')[0] format_params = [] if category == 'audio': format_params.extend([ '-b:a', f'{audio_bitrate}k', '-vn', # Drop the video stream ]) elif category == 'video': format_params.extend([ "-vf", ','.join([ f"scale=w={width}:h={height}:force_original_aspect_ratio=decrease", "crop='iw-mod(iw\\,2)':'ih-mod(ih\\,2)'", ]), ]) with tempfile.NamedTemporaryFile() as output_file: p = subprocess.run( [ FFMPEG, "-t", str(duration), "-i", url, "-f", FORMATS[format], *format_params, "-timelimit", str(request.context.get_remaining_time_in_millis() // 1000 - 2), # 2 seconds for padding "-fs", str(file_size), "-y", # Overwrite output file "-v", "error", # Only print errors output_file.name ], check=False, stdin=subprocess.DEVNULL, stderr=subprocess.PIPE) if p.returncode != 0: return make_json_response(403, {'error': p.stderr.decode()}) data = output_file.read() parsed = urlparse(url) filename = parsed.path.rpartition('/')[-1] headers = { 'Content-Type': format, 'Title': f"Preview of {filename}", 'Content-Disposition': f'inline; filename="{filename}"', } return 200, data, headers
def lambda_handler(request): """ Proxy the request to the elastic search. """ action = request.args.get('action') user_body = request.args.get('body', {}) user_fields = request.args.get('fields', []) user_indexes = request.args.get('index', "") user_size = request.args.get('size', DEFAULT_SIZE) user_source = request.args.get('_source', []) # 0-indexed starting position (for pagination) user_from = int(request.args.get('from', 0)) user_retry = int(request.args.get('retry', 0)) terminate_after = int(os.environ.get('MAX_DOCUMENTS_PER_SHARD', 10_000)) if not user_indexes or not isinstance(user_indexes, str): raise ValueError( "Request must include index=<comma-separated string of indices>") if user_from < 0: raise ValueError("'from' must be a non-negative integer") if action == 'packages': query = request.args.get('query', '') body = user_body or { "query": { "query_string": { "analyze_wildcard": True, "lenient": True, "query": query, # see enterprise/**/bucket.py for mappings "fields": user_fields or [ # package 'comment', 'handle', 'handle_text^2', 'metadata', 'tags' ] } } } if not all(i.endswith('_packages') for i in user_indexes.split(',')): raise ValueError( "'packages' action searching indexes that don't end in '_packages'" ) _source = user_source size = user_size terminate_after = None elif action == 'search': query = request.args.get('query', '') my_fields = user_fields or [ # object 'content', 'comment', 'ext', 'key', 'key_text', 'meta_text', # package, and boost the fields 'handle^2', 'handle_text^2', 'metadata^2', 'tags^2' ] if user_retry <= 1: body = { "query": { "query_string": { "analyze_wildcard": True, "lenient": user_retry > 0, "query": query, # more precise searches vs OR "default_operator": "AND", # see enterprise/**/bucket.py for mappings "fields": my_fields } } } else: body = { "query": { "simple_query_string": { "query": query, "analyze_wildcard": user_retry < 3, "default_operator": "AND", "fields": my_fields, "lenient": True, } } } _source = user_source or [ 'key', 'version_id', 'updated', 'last_modified', 'size', 'user_meta', 'comment', 'handle', 'hash', 'tags', 'metadata', 'pointer_file' ] size = DEFAULT_SIZE elif action == 'stats': body = { "query": { "match_all": {} }, "aggs": { "totalBytes": { "sum": { "field": 'size' } }, "exts": { "terms": { "field": 'ext' }, "aggs": { "size": { "sum": { "field": 'size' } } }, }, "totalPackageHandles": { "value_count": { "field": "handle" } }, } } size = 0 # We still get all aggregates, just don't need the results _source = False # Consider all documents when computing counts, etc. 
terminate_after = None elif action == 'images': body = { 'query': { 'regexp': { 'ext': IMG_EXTS } }, 'collapse': { 'field': 'key', 'inner_hits': { 'name': 'latest', 'size': 1, 'sort': [{ 'last_modified': 'desc' }], '_source': ['key', 'version_id'], }, }, } size = NUM_PREVIEW_IMAGES _source = False elif action == 'sample': body = { 'query': { 'bool': { 'must': [{ 'regexp': { 'ext': SAMPLE_EXTS } }], 'must_not': [ { 'terms': { 'key': README_KEYS + [SUMMARIZE_KEY] } }, { 'wildcard': { 'key': '*/' + SUMMARIZE_KEY } }, ], }, }, 'collapse': { 'field': 'key', 'inner_hits': { 'name': 'latest', 'size': 1, 'sort': [{ 'last_modified': 'desc' }], '_source': ['key', 'version_id'], }, }, } size = NUM_PREVIEW_FILES _source = False else: return make_json_response(400, {"title": "Invalid action"}) es_host = os.environ['ES_HOST'] region = os.environ['AWS_REGION'] index_overrides = os.getenv('INDEX_OVERRIDES', '') auth = BotoAWSRequestsAuth(aws_host=es_host, aws_region=region, aws_service='es') es_client = Elasticsearch( hosts=[{ 'host': es_host, 'port': 443 }], http_auth=auth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection, timeout=MAX_QUERY_DURATION, ) to_search = f"{user_indexes},{index_overrides}" if index_overrides else user_indexes result = es_client.search( index=to_search, body=body, _source=_source, size=size, from_=user_from, # try turning this off to consider all documents terminate_after=terminate_after, ) return make_json_response(200, post_process(result, action))
def lambda_handler(request): """ Proxy the request to the elastic search. """ action = request.args.get('action') indexes = request.args.get('index') if action == 'search': query = request.args.get('query', '') body = { "query": { "simple_query_string": { "query": query, "fields": ['content', 'comment', 'key_text', 'meta_text'] } } } # TODO: should be user settable; we should proably forbid `content` (can be huge) _source = [ 'key', 'version_id', 'updated', 'last_modified', 'size', 'user_meta' ] size = 1000 elif action == 'stats': body = { "query": { "match_all": {} }, "aggs": { "totalBytes": { "sum": { "field": 'size' } }, "exts": { "terms": { "field": 'ext' }, "aggs": { "size": { "sum": { "field": 'size' } } }, }, "updated": { "max": { "field": 'updated' } }, } } size = 0 _source = [] else: return make_json_response(400, {"title": "Invalid action"}) es_host = os.environ['ES_HOST'] region = os.environ['AWS_REGION'] auth = BotoAWSRequestsAuth(aws_host=es_host, aws_region=region, aws_service='es') es_client = Elasticsearch(hosts=[{ 'host': es_host, 'port': 443 }], http_auth=auth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection) to_search = f"{indexes},{INDEX_OVERRIDES}" if INDEX_OVERRIDES else indexes result = es_client.search(to_search, body, _source=_source, size=size, timeout=MAX_QUERY_DURATION) return make_json_response(200, result)
def lambda_handler(request): """ dynamically handle preview requests for bytes in S3 caller must specify input_type (since there may be no file extension) Returns: JSON response """ url = request.args['url'] input_type = request.args.get('input') compression = request.args.get('compression') separator = request.args.get('sep') or ',' exclude_output = request.args.get('exclude_output') == 'true' try: max_bytes = int(request.args.get('max_bytes', CATALOG_LIMIT_BYTES)) except ValueError as error: return make_json_response(400, { 'title': 'Unexpected max_bytes= value', 'detail': str(error) }) parsed_url = urlparse(url, allow_fragments=False) if not (parsed_url.scheme == 'https' and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX) and parsed_url.username is None and parsed_url.password is None): return make_json_response( 400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'}) try: line_count = _str_to_line_count( request.args.get('line_count', str(CATALOG_LIMIT_LINES))) except ValueError as error: # format https://jsonapi.org/format/1.1/#error-objects return make_json_response(400, { 'title': 'Unexpected line_count= value', 'detail': str(error) }) # stream=True saves memory almost equal to file size resp = requests.get(url, stream=True) if resp.ok: content_iter = resp.iter_content(CHUNK) if input_type == 'csv': html, info = extract_csv( get_preview_lines(content_iter, compression, line_count, max_bytes), separator) elif input_type == 'excel': html, info = extract_excel(get_bytes(content_iter, compression)) elif input_type == 'fcs': html, info = extract_fcs(get_bytes(content_iter, compression)) elif input_type == 'ipynb': html, info = extract_ipynb(get_bytes(content_iter, compression), exclude_output) elif input_type == 'parquet': html, info = extract_parquet(get_bytes(content_iter, compression)) elif input_type == 'vcf': html, info = extract_vcf( get_preview_lines(content_iter, compression, line_count, max_bytes)) elif input_type in TEXT_TYPES: html, info = extract_txt( get_preview_lines(content_iter, compression, line_count, max_bytes)) else: assert False, f'unexpected input_type: {input_type}' assert isinstance(html, str), 'expected html parameter as string' assert isinstance(info, dict), 'expected info metadata to be a dict' ret_val = { 'info': info, 'html': html, } else: ret_val = { 'error': resp.reason, 'text': resp.text, } return make_json_response(resp.status_code, ret_val)
def lambda_handler(request): """ Generate thumbnails for images in S3 """ # Parse request info url = request.args['url'] size = SIZE_PARAMETER_MAP[request.args['size']] input_ = request.args.get('input', 'image') output = request.args.get('output', 'json') page = int(request.args.get('page', '1')) count_pages = request.args.get('countPages') == 'true' # Handle request resp = requests.get(url) if not resp.ok: # Errored, return error code ret_val = { 'error': resp.reason, 'text': resp.text, } return make_json_response(resp.status_code, ret_val) src_bytes = resp.content try: thumbnail_format = SUPPORTED_BROWSER_FORMATS.get( imageio.get_reader(src_bytes), "PNG") except ValueError: thumbnail_format = "JPEG" if input_ in ("pdf", "pptx") else "PNG" if input_ == "pdf": info, data = handle_pdf(src=src_bytes, page=page, size=size[0], count_pages=count_pages) elif input_ == "pptx": info, data = handle_pptx(src=src_bytes, page=page, size=size[0], count_pages=count_pages) else: # Read image data img = AICSImage(src_bytes) orig_size = list(img.reader.data.shape) # Generate a formatted ndarray using the image data # Makes some assumptions for n-dim data img = format_aicsimage_to_prepped(img) img = generate_thumbnail(img, size) thumbnail_size = img.size # Store the bytes thumbnail_bytes = BytesIO() img.save(thumbnail_bytes, thumbnail_format) # Get bytes data data = thumbnail_bytes.getvalue() # Create metadata object info = { 'original_size': orig_size, 'thumbnail_format': thumbnail_format, 'thumbnail_size': thumbnail_size, } headers = { 'Content-Type': Image.MIME[thumbnail_format], QUILT_INFO_HEADER: json.dumps(info) } return 200, data, headers