def test_extract_parquet(self):
    """Exercise extract_parquet in plain, HTML, and skip_rows modes under a
    tiny simulated memory budget, then sanity-check schema and shape."""
    path = BASE_DIR / 'amazon-reviews-1000.snappy.parquet'
    cell_value = '<td>TSD Airsoft/Paintball Full-Face Mask, Goggle Lens</td>'
    with patch('t4_lambda_shared.preview.get_available_memory') as mem_mock:
        # pretend there is almost no free memory so the preview drops rows
        mem_mock.return_value = 1
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet)
            assert all(bracket in body for bracket in ('<', '>'))
            assert body.count('<') == body.count('>'), \
                'expected matching HTML tags'
            assert cell_value not in body, 'only expected columns'
            assert 'skipped rows' in info['warnings']
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet, as_html=True)
            assert cell_value in body, 'missing expected HTML cell'
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet, skip_rows=True)
            assert 'skipped rows' in info['warnings']
            assert cell_value not in body, 'only expected columns'
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet, as_html=False)
            assert all(bracket not in body for bracket in ('<', '>')), \
                'did not expect HTML'
        # compare the reported metadata against pyarrow's own reading
        parquet_file = pq.ParquetFile(path)
        assert all(
            column in info['schema']['names']
            for column in parquet_file.schema.names
        )
        assert [
            parquet_file.metadata.num_rows,
            parquet_file.metadata.num_columns
        ] == info['shape'], 'Unexpected number of rows or columns'
def get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file"""
    compression = None
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]

    inferred_ext = infer_extensions(key, ext)
    if inferred_ext not in CONTENT_INDEX_EXTS:
        # not a deep-indexing target; index nothing beyond key/stats
        return ""

    if inferred_ext == ".ipynb":
        # we have no choice but to fetch the entire notebook, because we
        # are going to parse it
        # warning: huge notebooks could spike memory here
        cells = get_notebook_cells(bucket, key, size, compression,
                                   etag=etag, s3_client=s3_client,
                                   version_id=version_id)
        return trim_to_bytes(cells, ELASTIC_LIMIT_BYTES)

    if inferred_ext == ".parquet":
        if size >= get_available_memory():
            print(
                f"{bucket}/{key} too large to deserialize; skipping contents"
            )
            # at least index the key and other stats, but don't overrun memory
            # and fail indexing altogether
            return ""
        response = retry_s3("get", bucket, key, size,
                            etag=etag, s3_client=s3_client,
                            version_id=version_id)
        body, info = extract_parquet(
            get_bytes(response["Body"], compression),
            as_html=False,
            skip_rows=(inferred_ext in SKIP_ROWS_EXTS))
        # be smart and just send column names to ES (instead of bloated full schema)
        # if this is not an HTML/catalog preview
        columns = ','.join(info['schema']['names'])
        return trim_to_bytes(f"{columns}\n{body}", ELASTIC_LIMIT_BYTES)

    return get_plain_text(bucket, key, size, compression,
                          etag=etag, s3_client=s3_client,
                          version_id=version_id)
def get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file"""
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]
    else:
        compression = None

    content = ""
    ext = infer_extensions(key, ext)
    if ext in CONTENT_INDEX_EXTS:
        if ext == ".ipynb":
            # we have no choice but to fetch the entire notebook, because we
            # are going to parse it
            # warning: huge notebooks could spike memory here
            notebook = get_notebook_cells(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )
            content = trim_to_bytes(notebook, ELASTIC_LIMIT_BYTES)
        elif ext == ".parquet":
            response = retry_s3(
                "get",
                bucket,
                key,
                size,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )
            body, info = extract_parquet(
                get_bytes(response["Body"], compression), as_html=False)
            content = trim_to_bytes(f"{info}\n{body}", ELASTIC_LIMIT_BYTES)
        else:
            content = get_plain_text(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )
    return content
def get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file"""
    compression = None
    if ext == '.gz':
        compression = 'gz'
        # strip the .gz and look at the inner extension of the key
        ext = pathlib.PurePosixPath(key[:-len(ext)]).suffix.lower()

    if ext not in CONTENT_INDEX_EXTS:
        return ""

    if ext == ".ipynb":
        # we have no choice but to fetch the entire notebook, because we
        # are going to parse it
        # warning: huge notebooks could spike memory here
        cells = get_notebook_cells(bucket, key, size, compression,
                                   etag=etag, s3_client=s3_client,
                                   version_id=version_id)
        return trim_to_bytes(cells, ELASTIC_LIMIT_BYTES)

    if ext == ".parquet":
        response = retry_s3("get", bucket, key, size,
                            etag=etag, s3_client=s3_client,
                            version_id=version_id)
        body, _ = extract_parquet(
            get_bytes(response["Body"], compression), as_html=False)
        return body

    return get_plain_text(bucket, key, size, compression,
                          etag=etag, s3_client=s3_client,
                          version_id=version_id)
def lambda_handler(request):
    """
    dynamically handle preview requests for bytes in S3
    caller must specify input_type (since there may be no file extension)

    Returns:
        JSON response
    """
    url = request.args['url']
    input_type = request.args.get('input')
    compression = request.args.get('compression')
    separator = request.args.get('sep') or ','
    exclude_output = request.args.get('exclude_output') == 'true'

    try:
        max_bytes = int(request.args.get('max_bytes', CATALOG_LIMIT_BYTES))
    except ValueError as error:
        return make_json_response(400, {
            'title': 'Unexpected max_bytes= value',
            'detail': str(error)
        })

    # only accept https S3 virtual-host URLs without embedded credentials
    parsed_url = urlparse(url, allow_fragments=False)
    is_s3_url = (
        parsed_url.scheme == 'https'
        and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
        and parsed_url.username is None
        and parsed_url.password is None
    )
    if not is_s3_url:
        return make_json_response(
            400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'})

    try:
        line_count = _str_to_line_count(
            request.args.get('line_count', str(CATALOG_LIMIT_LINES)))
    except ValueError as error:
        # format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected line_count= value',
            'detail': str(error)
        })

    # stream=True saves memory almost equal to file size
    response = requests.get(url, stream=True)
    if not response.ok:
        return make_json_response(response.status_code, {
            'error': response.reason,
            'text': response.text,
        })

    chunks = response.iter_content(CHUNK)
    if input_type == 'csv':
        html, info = extract_csv(
            get_preview_lines(chunks, compression, line_count, max_bytes),
            separator)
    elif input_type == 'excel':
        html, info = extract_excel(get_bytes(chunks, compression))
    elif input_type == 'fcs':
        html, info = extract_fcs(get_bytes(chunks, compression))
    elif input_type == 'ipynb':
        html, info = extract_ipynb(get_bytes(chunks, compression),
                                   exclude_output)
    elif input_type == 'parquet':
        html, info = extract_parquet(get_bytes(chunks, compression))
    elif input_type == 'vcf':
        html, info = extract_vcf(
            get_preview_lines(chunks, compression, line_count, max_bytes))
    elif input_type in TEXT_TYPES:
        html, info = extract_txt(
            get_preview_lines(chunks, compression, line_count, max_bytes))
    else:
        assert False, f'unexpected input_type: {input_type}'

    assert isinstance(html, str), 'expected html parameter as string'
    assert isinstance(info, dict), 'expected info metadata to be a dict'

    return make_json_response(response.status_code, {
        'info': info,
        'html': html,
    })
def maybe_get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file if it's a target for deep indexing"""
    log = get_quilt_logger()

    compression = None
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]

    log.debug(
        "Entering maybe_get_contents (could run out of mem.) %s %s %s",
        bucket, key, version_id
    )

    content = ""
    inferred_ext = infer_extensions(key, ext)
    if inferred_ext in get_content_index_extensions(bucket_name=bucket):

        def _fetch_object():
            # single retried S3 GET shared by the branches below
            return retry_s3(
                "get",
                bucket,
                key,
                size,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )

        if inferred_ext == ".fcs":
            obj = _fetch_object()
            body, info = extract_fcs(
                get_bytes(obj["Body"], compression), as_html=False)
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            content = trim_to_bytes(
                f"{body}\n{info}",
                get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".ipynb":
            # we have no choice but to fetch the entire notebook, because we
            # are going to parse it
            # warning: huge notebooks could spike memory here
            cells = get_notebook_cells(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id
            )
            content = trim_to_bytes(
                cells, get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".parquet":
            if size >= get_available_memory():
                print(f"{bucket}/{key} too large to deserialize; skipping contents")
                # at least index the key and other stats, but don't overrun memory
                # and fail indexing altogether
                return ""
            obj = _fetch_object()
            body, info = extract_parquet(
                get_bytes(obj["Body"], compression),
                as_html=False,
                skip_rows=(inferred_ext in SKIP_ROWS_EXTS),
                max_bytes=get_content_index_bytes(bucket_name=bucket),
            )
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            columns = ','.join(info['schema']['names'])
            content = trim_to_bytes(
                f"{columns}\n{body}",
                get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".pdf":
            obj = _fetch_object()
            content = trim_to_bytes(
                extract_pdf(get_bytes(obj["Body"], compression)),
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext in (".xls", ".xlsx"):
            obj = _fetch_object()
            body, _ = extract_excel(
                get_bytes(obj["Body"], compression), as_html=False)
            content = trim_to_bytes(
                body,
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext == ".pptx":
            obj = _fetch_object()
            content = extract_pptx(
                get_bytes(obj["Body"], compression),
                get_content_index_bytes(bucket_name=bucket))
        else:
            content = get_plain_text(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id
            )

    return content