Example #1
def lambda_handler(request):
    """
    Dynamically handle preview requests for bytes in S3.
    The caller must specify input_type, since there may be no file extension.

    Returns:
        JSON response
    """
    url = request.args['url']
    input_type = request.args.get('input')
    compression = request.args.get('compression')

    parsed_url = urlparse(url, allow_fragments=False)
    if not (parsed_url.scheme == 'https'
            and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
            and parsed_url.username is None and parsed_url.password is None):
        return make_json_response(
            400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'})

    try:
        line_count = _str_to_line_count(
            request.args.get('line_count', str(MAX_LINES)))
    except ValueError as error:
        # format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected line_count= value',
            'detail': str(error)
        })

    # stream=True avoids loading the entire object into memory at once
    resp = requests.get(url, stream=True)
    if resp.ok:
        if input_type == 'excel':
            html, info = extract_excel(_to_memory(resp, compression))
        elif input_type == 'ipynb':
            html, info = extract_ipynb(_to_memory(resp, compression))
        elif input_type == 'parquet':
            html, info = extract_parquet(_to_memory(resp, compression))
        elif input_type == 'vcf':
            html, info = extract_vcf(
                _from_stream(resp, compression, line_count))
        elif input_type == 'txt':
            html, info = extract_txt(
                _from_stream(resp, compression, line_count))
        else:
            assert False, f'unexpected input_type: {input_type}'

        assert isinstance(html, str), 'expected html parameter as string'
        assert isinstance(info, dict), 'expected info metadata to be a dict'

        ret_val = {
            'info': info,
            'html': html,
        }

    else:
        ret_val = {'error': resp.reason}

    return make_json_response(resp.status_code, ret_val)
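Both this handler and the fuller variant in Example #17 delegate line_count= parsing to `_str_to_line_count` and map its `ValueError` to a 400 response. A minimal sketch consistent with that usage; only the ValueError contract is pinned down by the call sites, the positivity check is an assumption:

def _str_to_line_count(value):
    """Parse the line_count= query parameter (sketch)."""
    line_count = int(value)  # non-numeric input raises ValueError here
    if line_count <= 0:  # assumption: the real helper also rejects non-positive counts
        raise ValueError(f'line_count must be positive, got {line_count}')
    return line_count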
Example #2
File: test_utils.py Project: zkan/quilt
    def test_json_response(self):
        """
        Test make_json_response()
        """
        status, body, headers = make_json_response(400, {'foo': 'bar'})
        assert status == 400
        assert json.loads(body) == {'foo': 'bar'}
        assert headers == {'Content-Type': 'application/json'}

        status, body, headers = make_json_response(200, {'foo': 'bar'}, {'Content-Length': '123'})
        assert status == 200
        assert json.loads(body) == {'foo': 'bar'}
        assert headers == {'Content-Type': 'application/json', 'Content-Length': '123'}
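This test pins down the contract every handler in this listing relies on: `make_json_response` returns a `(status, body, headers)` tuple, serializes the body to JSON, and merges caller-supplied headers over a default `Content-Type`. A minimal sketch consistent with the test; the `add_status` keyword seen in Example #7 is an assumption here:

import json

def make_json_response(status, data, headers=None, add_status=False):
    """Build the (status, body, headers) triple the lambda handlers return."""
    if add_status:  # assumption: mirrors the status code into the body (cf. Example #7)
        data = dict(data, status=status)
    all_headers = {'Content-Type': 'application/json'}
    if headers is not None:
        all_headers.update(headers)
    return status, json.dumps(data), all_headers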
Example #3
def lambda_handler(request):
    """
    Generate thumbnails for images in S3
    """
    url = request.args['url']
    size = SIZE_PARAMETER_MAP[request.args['size']]
    output = request.args.get('output', 'json')

    resp = requests.get(url)
    if resp.ok:
        image_bytes = BytesIO(resp.content)
        with Image.open(image_bytes) as image:
            orig_format = image.format
            orig_size = image.size

            if orig_format in SUPPORTED_BROWSER_FORMATS:
                thumbnail_format = orig_format
            else:
                image = image.convert('RGBA')
                thumbnail_format = 'PNG'

            image.thumbnail(size)
            thumbnail_size = image.size
            thumbnail_bytes = BytesIO()
            image.save(thumbnail_bytes, thumbnail_format)

        data = thumbnail_bytes.getvalue()

        info = {
            'original_format': orig_format,
            'original_size': orig_size,
            'thumbnail_format': thumbnail_format,
            'thumbnail_size': thumbnail_size,
        }

        if output == 'json':
            ret_val = {
                'info': info,
                'thumbnail': base64.b64encode(data).decode(),
            }
            return make_json_response(200, ret_val)
        else:
            headers = {
                'Content-Type': Image.MIME[thumbnail_format],
                'X-Quilt-Info': json.dumps(info)
            }
            return 200, data, headers

    else:
        ret_val = {'error': resp.reason}
        return make_json_response(resp.status_code, ret_val)
Example #4
def lambda_handler(request):
    """
    Convert molecule formats
    """
    url = request.args["url"]
    format_ = request.args["format"]

    resp = requests.get(url)
    if not resp.ok:
        # Errored, return error code
        ret_val = {
            "error": resp.reason,
            "text": resp.text,
        }
        return make_json_response(resp.status_code, ret_val)
    input_bytes = resp.content

    filename = urlparse(url).path.rpartition("/")[-1]
    input_base, input_ext = os.path.splitext(filename)
    if input_ext == ".gz":
        input_ext = os.path.splitext(input_base)[1]
        input_bytes = gzip.decompress(input_bytes)
    input_ext = input_ext[1:]

    p = subprocess.run(
        (
            OBABEL,
            f"-i{input_ext}",
            f"-o{FORMATS[format_]}",
        ),
        check=False,
        input=input_bytes,
        capture_output=True,
    )

    if p.returncode != 0:
        return make_json_response(403, {"error": p.stderr.decode()})

    data = p.stdout

    headers = {
        "Content-Type": format_,
        "Content-Disposition": f'inline; filename="{input_base}.{FORMATS[format_]}"',
    }

    return 200, data, headers
Example #5
def lambda_handler(request):
    """
    Proxy the request to Elasticsearch.
    """
    es_host = os.environ['ES_HOST']
    region = os.environ['AWS_REGION']

    auth = BotoAWSRequestsAuth(aws_host=es_host,
                               aws_region=region,
                               aws_service='es')

    es_client = Elasticsearch(
        hosts=[{'host': es_host, 'port': 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )

    index = request.pathParameters['proxy']
    body = request.args.get('source')
    _source = request.args.get('_source')
    size = request.args.get('size', '1000')

    result = es_client.search(index,
                              body,
                              _source=_source,
                              size=size,
                              timeout=MAX_QUERY_DURATION)

    return make_json_response(200, result)
Example #6
def lambda_handler(request):
    url = request.args["url"]
    input_type = request.args.get("input")
    output_size = request.args.get("size", "small")
    compression = request.args.get("compression")

    if not is_s3_url(url):
        return make_json_response(
            400, {"title": "Invalid url=. Expected S3 virtual-host URL."})

    handler = handlers[input_type]
    return handler(url, compression, OUTPUT_SIZES[output_size])
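`is_s3_url` factors out the virtual-host check that Examples #1 and #17 perform inline with `urlparse`. A sketch along those lines; the concrete value of `S3_DOMAIN_SUFFIX` is an assumption:

from urllib.parse import urlparse

S3_DOMAIN_SUFFIX = '.amazonaws.com'  # assumption; the real constant may be narrower

def is_s3_url(url):
    """True only for S3 virtual-host HTTPS URLs without embedded credentials."""
    parsed = urlparse(url, allow_fragments=False)
    return (parsed.scheme == 'https'
            and parsed.netloc.endswith(S3_DOMAIN_SUFFIX)
            and parsed.username is None
            and parsed.password is None)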
Example #7
File: __init__.py Project: akarve/quilt
def _push_pkg_to_successor(data, *, get_src, get_dst, get_name, get_pkg,
                           pkg_max_size, pkg_max_files):
    dst_registry = get_registry(get_dst(data))
    src_registry = get_registry(get_src(data))
    copy_data = _get_successor_params(src_registry,
                                      dst_registry).get('copy_data', True)

    try:
        pkg = get_pkg(src_registry, data)
        if copy_data:
            total_size = 0
            total_files = 0
            for lk, e in pkg.walk():
                total_size += e.size
                if total_size > pkg_max_size:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Total package size is {total_size}, "
                        f"but max supported size with `copy_data: true` is {pkg_max_size}"
                    )
                total_files += 1
                if total_files > pkg_max_files:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Package has {total_files} files, "
                        f"but max supported number with `copy_data: true` is {pkg_max_files}"
                    )
        meta = data.get('meta')
        if meta is None:
            pkg._meta.pop('user_meta', None)
        else:
            pkg.set_meta(meta)
        return make_json_response(
            200,
            {
                'top_hash': pkg._push(
                    name=get_name(data),
                    registry=get_dst(data),
                    message=data.get('message'),
                    workflow=data.get('workflow', ...),
                    selector_fn=None if copy_data else lambda *args: False,
                    print_info=False,
                )._origin.top_hash,
            },
            add_status=True,
        )
    except quilt3.util.QuiltException as qe:
        raise ApiException(HTTPStatus.BAD_REQUEST, qe.message)
    except ClientError as boto_error:
        raise ApiException.from_botocore_error(boto_error)
    except quilt3.data_transfer.S3NoValidClientError as e:
        raise ApiException(HTTPStatus.FORBIDDEN, e.message)
Example #8
File: __init__.py Project: akarve/quilt
def create_package(request):
    json_iterator = map(json.JSONDecoder().decode,
                        (line.decode() for line in request.stream))

    data = next(json_iterator)
    get_schema_validator(PACKAGE_CREATE_SCHEMA)(data)
    handle = data['name']
    registry = data['registry']

    try:
        package_registry = get_registry(registry)

        meta = data.get('meta')
        message = data.get('message')
        quilt3.util.validate_package_name(handle)
        pkg = quilt3.Package()
        if meta is not None:
            pkg.set_meta(meta)

        size_to_hash = 0
        files_to_hash = 0
        for entry in map(get_schema_validator(PACKAGE_CREATE_ENTRY_SCHEMA),
                         json_iterator):
            try:
                physical_key = PhysicalKey.from_url(entry['physical_key'])
            except ValueError:
                raise ApiException(
                    HTTPStatus.BAD_REQUEST,
                    f"{entry['physical_key']} is not a valid s3 URL.")
            if physical_key.is_local():
                raise ApiException(HTTPStatus.BAD_REQUEST,
                                   f"{str(physical_key)} is not in S3.")
            logical_key = entry['logical_key']

            hash_ = entry.get('hash')
            obj_size = entry.get('size')
            meta = entry.get('meta')

            if hash_ and obj_size is not None:
                pkg.set(
                    logical_key,
                    quilt3.packages.PackageEntry(
                        physical_key,
                        obj_size,
                        {
                            'type': 'SHA256',
                            'value': hash_
                        },
                        meta,
                    ))
            else:
                pkg.set(logical_key, str(physical_key), meta)

                size_to_hash += pkg[logical_key].size
                if size_to_hash > PKG_FROM_FOLDER_MAX_PKG_SIZE:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Total size of new S3 files is {size_to_hash}, "
                        f"but max supported size is {PKG_FROM_FOLDER_MAX_PKG_SIZE}"
                    )

                files_to_hash += 1
                if files_to_hash > PKG_FROM_FOLDER_MAX_FILES:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Package has new S3 {files_to_hash} files, "
                        f"but max supported number is {PKG_FROM_FOLDER_MAX_FILES}"
                    )

        pkg._validate_with_workflow(
            registry=package_registry,
            workflow=data.get('workflow', ...),
            name=handle,
            message=message,
        )

    except quilt3.util.QuiltException as qe:
        raise ApiException(HTTPStatus.BAD_REQUEST, qe.message)

    calculate_pkg_hashes(user_boto_session, pkg)
    try:
        top_hash = pkg._build(
            name=handle,
            registry=registry,
            message=message,
        )
    except ClientError as boto_error:
        raise ApiException.from_botocore_error(boto_error)

    return make_json_response(200, {
        'top_hash': top_hash,
    })
Example #9
def lambda_handler(request):
    """
    Parse a manifest to return a folder-like view of its contents (logical keys).

    Returns:
        JSON response
    """
    bucket = request.args['bucket']
    key = request.args['manifest']
    prefix = request.args.get('prefix')
    logical_key = request.args.get('logical_key')
    access_key = request.args.get('access_key')
    secret_key = request.args.get('secret_key')
    session_token = request.args.get('session_token')
    allow_anonymous_access = bool(os.getenv('ALLOW_ANONYMOUS_ACCESS'))

    # If credentials are passed in, use them
    # for the client. If no credentials are supplied, test that
    # the manifest object is publicly accessible. If so, create
    # an s3 client using the underlying IAM role's permissions.

    if access_key and secret_key and session_token:
        s3_client = create_s3_client(aws_access_key_id=access_key,
                                     aws_secret_access_key=secret_key,
                                     aws_session_token=session_token)
    elif (allow_anonymous_access and access_key is None and secret_key is None
          and session_token is None):
        # Test whether the target key is publicly accessible. If not, the
        # call below raises an exception and we return a 403 response.
        anons3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
        try:
            anons3.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as error:
            if error.response.get('Error'):
                code = error.response['Error']['Code']
                if code == '403':
                    return make_json_response(
                        403, {
                            'title': 'Access Denied',
                            'detail': f"Access denied reading manifest: {key}"
                        })
            raise error

        # Use the default S3 client configuration
        s3_client = boto3.client('s3')
    else:
        return make_json_response(
            401, {
                'title': 'Incomplete credentials',
                'detail': "access_key, secret_key and session_token are required"
            })
    assert s3_client

    # Get details of a single file in the package
    if logical_key is not None:
        sql_stmt = f"SELECT s.* FROM s3object s WHERE s.logical_key = '{sql_escape(logical_key)}' LIMIT 1"
        response_data = json.load(
            query_manifest_content(s3_client,
                                   bucket=bucket,
                                   key=key,
                                   sql_stmt=sql_stmt))
    else:
        # Call s3 select to fetch only logical keys matching the
        # desired prefix (folder path)
        prefix_length = len(prefix) if prefix is not None else 0
        sql_stmt = (
            f"SELECT SUBSTRING(s.logical_key, {prefix_length + 1}) AS logical_key"
            ", s.\"size\", s.physical_keys[0] as physical_key FROM s3object s")
        if prefix:
            sql_stmt += f" WHERE SUBSTRING(s.logical_key, 1, {prefix_length}) = '{sql_escape(prefix)}'"
        result = query_manifest_content(s3_client,
                                        bucket=bucket,
                                        key=key,
                                        sql_stmt=sql_stmt)
        # Parse the response into a logical folder view
        df = pd.read_json(result, lines=True)
        response_data = file_list_to_folder(df)

        # Fetch package-level or directory-level metadata
        if prefix:
            sql_stmt = f"SELECT s.meta FROM s3object s WHERE s.logical_key = '{sql_escape(prefix)}'"
        else:
            sql_stmt = "SELECT s.* FROM s3object s WHERE s.logical_key is NULL"
        result = query_manifest_content(s3_client,
                                        bucket=bucket,
                                        key=key,
                                        sql_stmt=sql_stmt)
        meta = json.load(result) if result else {}
        response_data.update(dict(meta=meta))

    ret_val = make_json_response(200, {'contents': response_data})

    return ret_val
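The handler interpolates caller-supplied keys into S3 Select statements, so `sql_escape` must neutralize quote characters. A plausible sketch, assuming standard SQL single-quote doubling is the only escaping required:

def sql_escape(value):
    """Escape a string for use inside a single-quoted S3 Select literal (sketch)."""
    return value.replace("'", "''")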
Example #10
File: index.py Project: viveklak/quilt
def lambda_handler(request):
    """
    Generate thumbnails for images in S3
    """
    # Parse request info
    url = request.args['url']
    size = SIZE_PARAMETER_MAP[request.args['size']]
    output = request.args.get('output', 'json')

    # Handle request
    resp = requests.get(url)
    if resp.ok:
        # Get the original reader / format
        # If the original reader isn't in the supported formats map, use PNG as default presentation format
        try:
            thumbnail_format = SUPPORTED_BROWSER_FORMATS.get(imageio.get_reader(resp.content), "PNG")
        # In the case imageio can't read the image, default to PNG
        # Usually an OME-TIFF / CZI / some other bio-format
        except ValueError:
            thumbnail_format = "PNG"

        # Read image data
        img = AICSImage(resp.content)
        orig_size = list(img.reader.data.shape)

        # Generate a formatted ndarray using the image data
        # Makes some assumptions for n-dim data
        img = format_aicsimage_to_prepped(img)

        # Send to Image object for thumbnail generation and saving to bytes
        img = Image.fromarray(img)

        # Generate thumbnail
        img.thumbnail(size)
        thumbnail_size = img.size

        # Store the bytes
        thumbnail_bytes = BytesIO()
        img.save(thumbnail_bytes, thumbnail_format)

        # Get bytes data
        data = thumbnail_bytes.getvalue()

        # Create metadata object
        info = {
            'original_size': orig_size,
            'thumbnail_format': thumbnail_format,
            'thumbnail_size': thumbnail_size,
        }

        # Store data
        if output == 'json':
            ret_val = {
                'info': info,
                'thumbnail': base64.b64encode(data).decode(),
            }
            return make_json_response(200, ret_val)

        # Not json response
        headers = {
            'Content-Type': Image.MIME[thumbnail_format],
            'X-Quilt-Info': json.dumps(info)
        }
        return 200, data, headers

    # Errored, return error code
    ret_val = {
        'error': resp.reason
    }
    return make_json_response(resp.status_code, ret_val)
Example #11
File: index.py Project: Laeeth/quilt
def lambda_handler(request):
    """
    Generate thumbnails for images in S3
    """
    # Parse request info
    url = request.args['url']
    size = SIZE_PARAMETER_MAP[request.args['size']]
    input_ = request.args.get('input', 'image')
    output = request.args.get('output', 'json')
    page = int(request.args.get('page', '1'))
    count_pages = request.args.get('countPages') == 'true'

    # Handle request
    resp = requests.get(url)
    if resp.ok:
        try:
            thumbnail_format = SUPPORTED_BROWSER_FORMATS.get(
                imageio.get_reader(resp.content), "PNG")
        except ValueError:
            thumbnail_format = "JPEG" if input_ == "pdf" else "PNG"
        if input_ == "pdf":
            set_pdf_env()
            try:
                kwargs = {
                    # respect width but not necessarily height to preserve aspect ratio
                    "size": (size[0], None),
                    "fmt": "JPEG",
                }
                if not count_pages:
                    kwargs["first_page"] = page
                    kwargs["last_page"] = page

                pages = convert_from_bytes(
                    resp.content,
                    **kwargs,
                )
                num_pages = len(pages)
                preview = pages[0]
                if count_pages:
                    # shift 1-index to 0-index
                    preview = pages[page - 1]
            except (IndexError, PDFInfoNotInstalledError, PDFPageCountError,
                    PDFSyntaxError, PopplerNotInstalledError) as exc:
                return make_json_response(500, {'error': str(exc)})

            info = {
                'thumbnail_format': 'JPEG',
                'thumbnail_size': preview.size,
            }
            if count_pages:
                info['page_count'] = num_pages

            thumbnail_bytes = BytesIO()
            preview.save(thumbnail_bytes, thumbnail_format)
            data = thumbnail_bytes.getvalue()
        else:
            # Read image data
            img = AICSImage(resp.content)
            orig_size = list(img.reader.data.shape)
            # Generate a formatted ndarray using the image data
            # Makes some assumptions for n-dim data
            img = format_aicsimage_to_prepped(img)
            # Send to Image object for thumbnail generation and saving to bytes
            img = Image.fromarray(img)
            # Generate thumbnail
            img.thumbnail(size)
            thumbnail_size = img.size
            # Store the bytes
            thumbnail_bytes = BytesIO()
            img.save(thumbnail_bytes, thumbnail_format)
            # Get bytes data
            data = thumbnail_bytes.getvalue()
            # Create metadata object
            info = {
                'original_size': orig_size,
                'thumbnail_format': thumbnail_format,
                'thumbnail_size': thumbnail_size,
            }

        if output == 'json':
            ret_val = {
                'info': info,
                'thumbnail': base64.b64encode(data).decode(),
            }
            return make_json_response(200, ret_val)
        # Not JSON response ('raw')
        headers = {
            'Content-Type': Image.MIME[thumbnail_format],
            QUILT_INFO_HEADER: json.dumps(info)
        }
        return 200, data, headers

    # Errored, return error code
    ret_val = {
        'error': resp.reason,
        'text': resp.text,
    }
    return make_json_response(resp.status_code, ret_val)
Example #12
# Inner wrapper of an error-handling decorator (see the sketch below);
# `f` and `exception_types` are closed over from the enclosing scope.
def wrapper(*args, **kwargs):
    try:
        return f(*args, **kwargs)
    except exception_types as e:
        return make_json_response(500, {'error': str(e)})
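One way the enclosing decorator could look, as a sketch only; the name `catch_exceptions` and its exact shape are assumptions, not the project's actual API:

import functools

def catch_exceptions(*exception_types):
    """Convert the listed exceptions into 500 JSON responses."""
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except exception_types as e:
                return make_json_response(500, {'error': str(e)})
        return wrapper
    return decorator

# Hypothetical usage:
# @catch_exceptions(ValueError, KeyError)
# def lambda_handler(request):
#     ...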
Example #13
File: index.py Project: knut0815/quilt
def lambda_handler(request):
    """
    Proxy the request to Elasticsearch.
    """

    action = request.args.get('action')
    indexes = request.args.get('index')
    terminate_after = os.getenv('MAX_DOCUMENTS_PER_SHARD')

    if action == 'search':
        query = request.args.get('query', '')
        body = {
            "query": {
                "simple_query_string": {
                    "query": query,
                    "fields": ['content', 'comment', 'key_text', 'meta_text']
                }
            }
        }
        # TODO: should be user settable; we should probably forbid `content` (can be huge)
        _source = [
            'key', 'version_id', 'updated', 'last_modified', 'size',
            'user_meta'
        ]
        size = 1000
    elif action == 'stats':
        body = {
            "query": {
                "match_all": {}
            },
            "aggs": {
                "totalBytes": {
                    "sum": {
                        "field": 'size'
                    }
                },
                "exts": {
                    "terms": {
                        "field": 'ext'
                    },
                    "aggs": {
                        "size": {
                            "sum": {
                                "field": 'size'
                            }
                        }
                    },
                },
            }
        }
        size = 0  # We still get all aggregates, just don't need the results
        _source = []
        # Consider all documents when computing counts, etc.
        terminate_after = None
    elif action == 'images':
        body = {
            'query': {
                'terms': {
                    'ext': IMG_EXTS
                }
            },
            'collapse': {
                'field': 'key',
                'inner_hits': {
                    'name': 'latest',
                    'size': 1,
                    'sort': [{
                        'last_modified': 'desc'
                    }],
                    '_source': ['key', 'version_id'],
                },
            },
        }
        size = NUM_PREVIEW_IMAGES
        _source = []
    elif action == 'sample':
        body = {
            'query': {
                'bool': {
                    'must': [{
                        'terms': {
                            'ext': SAMPLE_EXTS
                        }
                    }],
                    'must_not': [
                        {
                            'terms': {
                                'key': README_KEYS + [SUMMARIZE_KEY]
                            }
                        },
                        {
                            'wildcard': {
                                'key': '*/' + SUMMARIZE_KEY
                            }
                        },
                    ],
                },
            },
            'collapse': {
                'field': 'key',
                'inner_hits': {
                    'name': 'latest',
                    'size': 1,
                    'sort': [{
                        'last_modified': 'desc'
                    }],
                    '_source': ['key', 'version_id'],
                },
            },
        }
        size = NUM_PREVIEW_FILES
        _source = []
    else:
        return make_json_response(400, {"title": "Invalid action"})

    es_host = os.environ['ES_HOST']
    region = os.environ['AWS_REGION']
    index_overrides = os.getenv('INDEX_OVERRIDES', '')

    auth = BotoAWSRequestsAuth(aws_host=es_host,
                               aws_region=region,
                               aws_service='es')

    es_client = Elasticsearch(
        hosts=[{'host': es_host, 'port': 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )

    to_search = f"{indexes},{index_overrides}" if index_overrides else indexes
    result = es_client.search(to_search,
                              body,
                              _source=_source,
                              size=size,
                              terminate_after=terminate_after,
                              timeout=MAX_QUERY_DURATION)

    return make_json_response(200, post_process(result, action))
Example #14
def lambda_handler(request):
    """
    Generate previews for videos in S3
    """
    url = request.args['url']
    format = request.args['format']

    def _parse_param(name, default_value, min_value, max_value):
        value_str = request.args.get(name)
        if value_str is None:
            return default_value

        try:
            value = type(default_value)(value_str)
            if not min_value <= value <= max_value:
                raise ValueError
            return value
        except ValueError:
            raise ValueError(
                f"Invalid {name!r}; must be between {min_value} and {max_value} inclusive"
            )

    try:
        width = _parse_param('width', 320, 10, 640)
        height = _parse_param('height', 240, 10, 480)
        duration = _parse_param('duration', 5.0, 0.1, 30)
        audio_bitrate = _parse_param('audio_bitrate', 128, 64, 320)
        file_size = _parse_param('file_size', MAX_FILE_SIZE, 1024,
                                 MAX_FILE_SIZE)
    except ValueError as ex:
        return make_json_response(400, {'error': str(ex)})

    category = format.split('/')[0]

    format_params = []
    if category == 'audio':
        format_params.extend([
            '-b:a',
            f'{audio_bitrate}k',
            '-vn',  # Drop the video stream
        ])
    elif category == 'video':
        format_params.extend([
            "-vf",
            ','.join([
                f"scale=w={width}:h={height}:force_original_aspect_ratio=decrease",
                "crop='iw-mod(iw\\,2)':'ih-mod(ih\\,2)'",
            ]),
        ])

    with tempfile.NamedTemporaryFile() as output_file:
        p = subprocess.run(
            [
                FFMPEG,
                "-t",
                str(duration),
                "-i",
                url,
                "-f",
                FORMATS[format],
                *format_params,
                "-timelimit",
                str(request.context.get_remaining_time_in_millis() // 1000 -
                    2),  # 2 seconds for padding
                "-fs",
                str(file_size),
                "-y",  # Overwrite output file
                "-v",
                "error",  # Only print errors
                output_file.name
            ],
            check=False,
            stdin=subprocess.DEVNULL,
            stderr=subprocess.PIPE)

        if p.returncode != 0:
            return make_json_response(403, {'error': p.stderr.decode()})

        data = output_file.read()

    parsed = urlparse(url)
    filename = parsed.path.rpartition('/')[-1]

    headers = {
        'Content-Type': format,
        'Title': f"Preview of {filename}",
        'Content-Disposition': f'inline; filename="{filename}"',
    }

    return 200, data, headers
Example #15
def lambda_handler(request):
    """
    Proxy the request to Elasticsearch.
    """

    action = request.args.get('action')
    user_body = request.args.get('body', {})
    user_fields = request.args.get('fields', [])
    user_indexes = request.args.get('index', "")
    user_size = request.args.get('size', DEFAULT_SIZE)
    user_source = request.args.get('_source', [])
    # 0-indexed starting position (for pagination)
    user_from = int(request.args.get('from', 0))
    user_retry = int(request.args.get('retry', 0))
    terminate_after = int(os.environ.get('MAX_DOCUMENTS_PER_SHARD', 10_000))

    if not user_indexes or not isinstance(user_indexes, str):
        raise ValueError(
            "Request must include index=<comma-separated string of indices>")

    if user_from < 0:
        raise ValueError("'from' must be a non-negative integer")

    if action == 'packages':
        query = request.args.get('query', '')
        body = user_body or {
            "query": {
                "query_string": {
                    "analyze_wildcard": True,
                    "lenient": True,
                    "query": query,
                    # see enterprise/**/bucket.py for mappings
                    "fields": user_fields or [
                        # package
                        'comment',
                        'handle',
                        'handle_text^2',
                        'metadata',
                        'tags',
                    ],
                }
            }
        }
        if not all(i.endswith('_packages') for i in user_indexes.split(',')):
            raise ValueError(
                "the 'packages' action requires indexes that end in '_packages'"
            )
        _source = user_source
        size = user_size
        terminate_after = None
    elif action == 'search':
        query = request.args.get('query', '')
        my_fields = user_fields or [
            # object
            'content',
            'comment',
            'ext',
            'key',
            'key_text',
            'meta_text',
            # package, and boost the fields
            'handle^2',
            'handle_text^2',
            'metadata^2',
            'tags^2'
        ]
        if user_retry <= 1:
            body = {
                "query": {
                    "query_string": {
                        "analyze_wildcard": True,
                        "lenient": user_retry > 0,
                        "query": query,
                        # more precise searches vs OR
                        "default_operator": "AND",
                        # see enterprise/**/bucket.py for mappings
                        "fields": my_fields
                    }
                }
            }
        else:
            body = {
                "query": {
                    "simple_query_string": {
                        "query": query,
                        "analyze_wildcard": user_retry < 3,
                        "default_operator": "AND",
                        "fields": my_fields,
                        "lenient": True,
                    }
                }
            }
        _source = user_source or [
            'key', 'version_id', 'updated', 'last_modified', 'size',
            'user_meta', 'comment', 'handle', 'hash', 'tags', 'metadata',
            'pointer_file'
        ]
        size = DEFAULT_SIZE
    elif action == 'stats':
        body = {
            "query": {
                "match_all": {}
            },
            "aggs": {
                "totalBytes": {
                    "sum": {
                        "field": 'size'
                    }
                },
                "exts": {
                    "terms": {
                        "field": 'ext'
                    },
                    "aggs": {
                        "size": {
                            "sum": {
                                "field": 'size'
                            }
                        }
                    },
                },
                "totalPackageHandles": {
                    "value_count": {
                        "field": "handle"
                    }
                },
            }
        }
        size = 0  # We still get all aggregates, just don't need the results
        _source = False
        # Consider all documents when computing counts, etc.
        terminate_after = None
    elif action == 'images':
        body = {
            'query': {
                'regexp': {
                    'ext': IMG_EXTS
                }
            },
            'collapse': {
                'field': 'key',
                'inner_hits': {
                    'name': 'latest',
                    'size': 1,
                    'sort': [{
                        'last_modified': 'desc'
                    }],
                    '_source': ['key', 'version_id'],
                },
            },
        }
        size = NUM_PREVIEW_IMAGES
        _source = False
    elif action == 'sample':
        body = {
            'query': {
                'bool': {
                    'must': [{
                        'regexp': {
                            'ext': SAMPLE_EXTS
                        }
                    }],
                    'must_not': [
                        {
                            'terms': {
                                'key': README_KEYS + [SUMMARIZE_KEY]
                            }
                        },
                        {
                            'wildcard': {
                                'key': '*/' + SUMMARIZE_KEY
                            }
                        },
                    ],
                },
            },
            'collapse': {
                'field': 'key',
                'inner_hits': {
                    'name': 'latest',
                    'size': 1,
                    'sort': [{
                        'last_modified': 'desc'
                    }],
                    '_source': ['key', 'version_id'],
                },
            },
        }
        size = NUM_PREVIEW_FILES
        _source = False
    else:
        return make_json_response(400, {"title": "Invalid action"})

    es_host = os.environ['ES_HOST']
    region = os.environ['AWS_REGION']
    index_overrides = os.getenv('INDEX_OVERRIDES', '')

    auth = BotoAWSRequestsAuth(aws_host=es_host,
                               aws_region=region,
                               aws_service='es')

    es_client = Elasticsearch(
        hosts=[{
            'host': es_host,
            'port': 443
        }],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=MAX_QUERY_DURATION,
    )

    to_search = f"{user_indexes},{index_overrides}" if index_overrides else user_indexes
    result = es_client.search(
        index=to_search,
        body=body,
        _source=_source,
        size=size,
        from_=user_from,
        # try turning this off to consider all documents
        terminate_after=terminate_after,
    )

    return make_json_response(200, post_process(result, action))
Example #16
def lambda_handler(request):
    """
    Proxy the request to Elasticsearch.
    """
    action = request.args.get('action')
    indexes = request.args.get('index')

    if action == 'search':
        query = request.args.get('query', '')
        body = {
            "query": {
                "simple_query_string": {
                    "query": query,
                    "fields": ['content', 'comment', 'key_text', 'meta_text']
                }
            }
        }
        # TODO: should be user settable; we should probably forbid `content` (can be huge)
        _source = [
            'key', 'version_id', 'updated', 'last_modified', 'size',
            'user_meta'
        ]
        size = 1000
    elif action == 'stats':
        body = {
            "query": {
                "match_all": {}
            },
            "aggs": {
                "totalBytes": {
                    "sum": {
                        "field": 'size'
                    }
                },
                "exts": {
                    "terms": {
                        "field": 'ext'
                    },
                    "aggs": {
                        "size": {
                            "sum": {
                                "field": 'size'
                            }
                        }
                    },
                },
                "updated": {
                    "max": {
                        "field": 'updated'
                    }
                },
            }
        }
        size = 0
        _source = []
    else:
        return make_json_response(400, {"title": "Invalid action"})

    es_host = os.environ['ES_HOST']
    region = os.environ['AWS_REGION']

    auth = BotoAWSRequestsAuth(aws_host=es_host,
                               aws_region=region,
                               aws_service='es')

    es_client = Elasticsearch(
        hosts=[{'host': es_host, 'port': 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )

    to_search = f"{indexes},{INDEX_OVERRIDES}" if INDEX_OVERRIDES else indexes
    result = es_client.search(to_search,
                              body,
                              _source=_source,
                              size=size,
                              timeout=MAX_QUERY_DURATION)

    return make_json_response(200, result)
Example #17
def lambda_handler(request):
    """
    Dynamically handle preview requests for bytes in S3.
    The caller must specify input_type, since there may be no file extension.

    Returns:
        JSON response
    """
    url = request.args['url']
    input_type = request.args.get('input')
    compression = request.args.get('compression')
    separator = request.args.get('sep') or ','
    exclude_output = request.args.get('exclude_output') == 'true'
    try:
        max_bytes = int(request.args.get('max_bytes', CATALOG_LIMIT_BYTES))
    except ValueError as error:
        return make_json_response(400, {
            'title': 'Unexpected max_bytes= value',
            'detail': str(error)
        })

    parsed_url = urlparse(url, allow_fragments=False)
    if not (parsed_url.scheme == 'https'
            and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
            and parsed_url.username is None and parsed_url.password is None):
        return make_json_response(
            400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'})

    try:
        line_count = _str_to_line_count(
            request.args.get('line_count', str(CATALOG_LIMIT_LINES)))
    except ValueError as error:
        # format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected line_count= value',
            'detail': str(error)
        })

    # stream=True avoids loading the entire object into memory at once
    resp = requests.get(url, stream=True)
    if resp.ok:
        content_iter = resp.iter_content(CHUNK)
        if input_type == 'csv':
            html, info = extract_csv(
                get_preview_lines(content_iter, compression, line_count,
                                  max_bytes), separator)
        elif input_type == 'excel':
            html, info = extract_excel(get_bytes(content_iter, compression))
        elif input_type == 'fcs':
            html, info = extract_fcs(get_bytes(content_iter, compression))
        elif input_type == 'ipynb':
            html, info = extract_ipynb(get_bytes(content_iter, compression),
                                       exclude_output)
        elif input_type == 'parquet':
            html, info = extract_parquet(get_bytes(content_iter, compression))
        elif input_type == 'vcf':
            html, info = extract_vcf(
                get_preview_lines(content_iter, compression, line_count,
                                  max_bytes))
        elif input_type in TEXT_TYPES:
            html, info = extract_txt(
                get_preview_lines(content_iter, compression, line_count,
                                  max_bytes))
        else:
            assert False, f'unexpected input_type: {input_type}'

        assert isinstance(html, str), 'expected html parameter as string'
        assert isinstance(info, dict), 'expected info metadata to be a dict'

        ret_val = {
            'info': info,
            'html': html,
        }
    else:
        ret_val = {
            'error': resp.reason,
            'text': resp.text,
        }

    return make_json_response(resp.status_code, ret_val)
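`get_bytes` and `get_preview_lines` buffer or trim the streamed body before the extractors run. A rough sketch of `get_bytes`, under the assumption that `compression` is either None or 'gz' (the marker Example #4 associates with gzip):

import gzip
from io import BytesIO

def get_bytes(content_iter, compression):
    """Collect streamed chunks into memory, inflating gzip if requested.

    Sketch only; the real helper may support more codecs and enforce size caps.
    """
    data = b''.join(content_iter)
    if compression == 'gz':  # assumption: 'gz' marks gzip input
        data = gzip.decompress(data)
    return BytesIO(data)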
Example #18
def lambda_handler(request):
    """
    Generate thumbnails for images in S3
    """
    # Parse request info
    url = request.args['url']
    size = SIZE_PARAMETER_MAP[request.args['size']]
    input_ = request.args.get('input', 'image')
    output = request.args.get('output', 'json')
    page = int(request.args.get('page', '1'))
    count_pages = request.args.get('countPages') == 'true'

    # Handle request
    resp = requests.get(url)
    if not resp.ok:
        # Errored, return error code
        ret_val = {
            'error': resp.reason,
            'text': resp.text,
        }
        return make_json_response(resp.status_code, ret_val)

    src_bytes = resp.content
    try:
        thumbnail_format = SUPPORTED_BROWSER_FORMATS.get(
            imageio.get_reader(src_bytes), "PNG")
    except ValueError:
        thumbnail_format = "JPEG" if input_ in ("pdf", "pptx") else "PNG"
    if input_ == "pdf":
        info, data = handle_pdf(src=src_bytes,
                                page=page,
                                size=size[0],
                                count_pages=count_pages)
    elif input_ == "pptx":
        info, data = handle_pptx(src=src_bytes,
                                 page=page,
                                 size=size[0],
                                 count_pages=count_pages)
    else:
        # Read image data
        img = AICSImage(src_bytes)
        orig_size = list(img.reader.data.shape)
        # Generate a formatted ndarray using the image data
        # Makes some assumptions for n-dim data
        img = format_aicsimage_to_prepped(img)

        img = generate_thumbnail(img, size)

        thumbnail_size = img.size
        # Store the bytes
        thumbnail_bytes = BytesIO()
        img.save(thumbnail_bytes, thumbnail_format)
        # Get bytes data
        data = thumbnail_bytes.getvalue()
        # Create metadata object
        info = {
            'original_size': orig_size,
            'thumbnail_format': thumbnail_format,
            'thumbnail_size': thumbnail_size,
        }

    headers = {
        'Content-Type': Image.MIME[thumbnail_format],
        QUILT_INFO_HEADER: json.dumps(info)
    }
    return 200, data, headers