def test_extract_parquet(self):
    """Exercise extract_parquet in plain, HTML, and skip_rows modes under a
    tiny simulated memory budget, then sanity-check schema and shape."""
    path = BASE_DIR / 'amazon-reviews-1000.snappy.parquet'
    cell_value = '<td>TSD Airsoft/Paintball Full-Face Mask, Goggle Lens</td>'
    with patch('t4_lambda_shared.preview.get_available_memory') as mem_mock:
        # pretend there is almost no free memory so the preview drops rows
        mem_mock.return_value = 1
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet)
            assert all(bracket in body for bracket in ('<', '>'))
            assert body.count('<') == body.count('>'), \
                'expected matching HTML tags'
            assert cell_value not in body, 'only expected columns'
            assert 'skipped rows' in info['warnings']
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet, as_html=True)
            assert cell_value in body, 'missing expected HTML cell'
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet, skip_rows=True)
            assert 'skipped rows' in info['warnings']
            assert cell_value not in body, 'only expected columns'
        with open(path, mode='rb') as parquet:
            body, info = extract_parquet(parquet, as_html=False)
            assert all(bracket not in body for bracket in ('<', '>')), \
                'did not expect HTML'
        # compare the reported metadata against pyarrow's own reading
        parquet_file = pq.ParquetFile(path)
        assert all(
            column in info['schema']['names']
            for column in parquet_file.schema.names
        )
        assert [
            parquet_file.metadata.num_rows,
            parquet_file.metadata.num_columns
        ] == info['shape'], 'Unexpected number of rows or columns'
def get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file"""
    compression = None
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]

    inferred_ext = infer_extensions(key, ext)
    if inferred_ext not in CONTENT_INDEX_EXTS:
        # not a deep-indexing target; index nothing beyond key/stats
        return ""

    if inferred_ext == ".ipynb":
        # we have no choice but to fetch the entire notebook, because we
        # are going to parse it
        # warning: huge notebooks could spike memory here
        cells = get_notebook_cells(bucket, key, size, compression,
                                   etag=etag, s3_client=s3_client,
                                   version_id=version_id)
        return trim_to_bytes(cells, ELASTIC_LIMIT_BYTES)

    if inferred_ext == ".parquet":
        if size >= get_available_memory():
            print(
                f"{bucket}/{key} too large to deserialize; skipping contents"
            )
            # at least index the key and other stats, but don't overrun memory
            # and fail indexing altogether
            return ""
        response = retry_s3("get", bucket, key, size,
                            etag=etag, s3_client=s3_client,
                            version_id=version_id)
        body, info = extract_parquet(
            get_bytes(response["Body"], compression),
            as_html=False,
            skip_rows=(inferred_ext in SKIP_ROWS_EXTS))
        # be smart and just send column names to ES (instead of bloated full schema)
        # if this is not an HTML/catalog preview
        columns = ','.join(info['schema']['names'])
        return trim_to_bytes(f"{columns}\n{body}", ELASTIC_LIMIT_BYTES)

    return get_plain_text(bucket, key, size, compression,
                          etag=etag, s3_client=s3_client,
                          version_id=version_id)
def get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file"""
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]
    else:
        compression = None

    content = ""
    ext = infer_extensions(key, ext)
    if ext in CONTENT_INDEX_EXTS:
        if ext == ".ipynb":
            # we have no choice but to fetch the entire notebook, because we
            # are going to parse it
            # warning: huge notebooks could spike memory here
            notebook = get_notebook_cells(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )
            content = trim_to_bytes(notebook, ELASTIC_LIMIT_BYTES)
        elif ext == ".parquet":
            response = retry_s3(
                "get",
                bucket,
                key,
                size,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )
            body, info = extract_parquet(
                get_bytes(response["Body"], compression), as_html=False)
            content = trim_to_bytes(f"{info}\n{body}", ELASTIC_LIMIT_BYTES)
        else:
            content = get_plain_text(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )
    return content
def get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file"""
    compression = None
    if ext == '.gz':
        compression = 'gz'
        # strip the .gz and look at the inner extension of the key
        ext = pathlib.PurePosixPath(key[:-len(ext)]).suffix.lower()

    if ext not in CONTENT_INDEX_EXTS:
        return ""

    if ext == ".ipynb":
        # we have no choice but to fetch the entire notebook, because we
        # are going to parse it
        # warning: huge notebooks could spike memory here
        cells = get_notebook_cells(bucket, key, size, compression,
                                   etag=etag, s3_client=s3_client,
                                   version_id=version_id)
        return trim_to_bytes(cells, ELASTIC_LIMIT_BYTES)

    if ext == ".parquet":
        response = retry_s3("get", bucket, key, size,
                            etag=etag, s3_client=s3_client,
                            version_id=version_id)
        body, _ = extract_parquet(
            get_bytes(response["Body"], compression), as_html=False)
        return body

    return get_plain_text(bucket, key, size, compression,
                          etag=etag, s3_client=s3_client,
                          version_id=version_id)
def lambda_handler(request):
    """
    dynamically handle preview requests for bytes in S3
    caller must specify input_type (since there may be no file extension)

    Returns:
        JSON response
    """
    url = request.args['url']
    input_type = request.args.get('input')
    compression = request.args.get('compression')
    separator = request.args.get('sep') or ','
    exclude_output = request.args.get('exclude_output') == 'true'

    try:
        max_bytes = int(request.args.get('max_bytes', CATALOG_LIMIT_BYTES))
    except ValueError as error:
        return make_json_response(400, {
            'title': 'Unexpected max_bytes= value',
            'detail': str(error)
        })

    # only accept https S3 virtual-host URLs without embedded credentials
    parsed_url = urlparse(url, allow_fragments=False)
    is_s3_url = (
        parsed_url.scheme == 'https'
        and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
        and parsed_url.username is None
        and parsed_url.password is None
    )
    if not is_s3_url:
        return make_json_response(
            400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'})

    try:
        line_count = _str_to_line_count(
            request.args.get('line_count', str(CATALOG_LIMIT_LINES)))
    except ValueError as error:
        # format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected line_count= value',
            'detail': str(error)
        })

    # stream=True saves memory almost equal to file size
    response = requests.get(url, stream=True)
    if not response.ok:
        return make_json_response(response.status_code, {
            'error': response.reason,
            'text': response.text,
        })

    chunks = response.iter_content(CHUNK)
    if input_type == 'csv':
        html, info = extract_csv(
            get_preview_lines(chunks, compression, line_count, max_bytes),
            separator)
    elif input_type == 'excel':
        html, info = extract_excel(get_bytes(chunks, compression))
    elif input_type == 'fcs':
        html, info = extract_fcs(get_bytes(chunks, compression))
    elif input_type == 'ipynb':
        html, info = extract_ipynb(get_bytes(chunks, compression),
                                   exclude_output)
    elif input_type == 'parquet':
        html, info = extract_parquet(get_bytes(chunks, compression))
    elif input_type == 'vcf':
        html, info = extract_vcf(
            get_preview_lines(chunks, compression, line_count, max_bytes))
    elif input_type in TEXT_TYPES:
        html, info = extract_txt(
            get_preview_lines(chunks, compression, line_count, max_bytes))
    else:
        assert False, f'unexpected input_type: {input_type}'

    assert isinstance(html, str), 'expected html parameter as string'
    assert isinstance(info, dict), 'expected info metadata to be a dict'

    return make_json_response(response.status_code, {
        'info': info,
        'html': html,
    })
def maybe_get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file if it's a target for deep indexing"""
    log = get_quilt_logger()

    compression = None
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]

    log.debug(
        "Entering maybe_get_contents (could run out of mem.) %s %s %s",
        bucket, key, version_id
    )

    content = ""
    inferred_ext = infer_extensions(key, ext)
    if inferred_ext in get_content_index_extensions(bucket_name=bucket):

        def _fetch_object():
            # single retried S3 GET shared by the branches below
            return retry_s3(
                "get",
                bucket,
                key,
                size,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )

        if inferred_ext == ".fcs":
            obj = _fetch_object()
            body, info = extract_fcs(
                get_bytes(obj["Body"], compression), as_html=False)
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            content = trim_to_bytes(
                f"{body}\n{info}",
                get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".ipynb":
            # we have no choice but to fetch the entire notebook, because we
            # are going to parse it
            # warning: huge notebooks could spike memory here
            cells = get_notebook_cells(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id
            )
            content = trim_to_bytes(
                cells, get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".parquet":
            if size >= get_available_memory():
                print(f"{bucket}/{key} too large to deserialize; skipping contents")
                # at least index the key and other stats, but don't overrun memory
                # and fail indexing altogether
                return ""
            obj = _fetch_object()
            body, info = extract_parquet(
                get_bytes(obj["Body"], compression),
                as_html=False,
                skip_rows=(inferred_ext in SKIP_ROWS_EXTS),
                max_bytes=get_content_index_bytes(bucket_name=bucket),
            )
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            columns = ','.join(info['schema']['names'])
            content = trim_to_bytes(
                f"{columns}\n{body}",
                get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".pdf":
            obj = _fetch_object()
            content = trim_to_bytes(
                extract_pdf(get_bytes(obj["Body"], compression)),
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext in (".xls", ".xlsx"):
            obj = _fetch_object()
            body, _ = extract_excel(
                get_bytes(obj["Body"], compression), as_html=False)
            content = trim_to_bytes(
                body,
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext == ".pptx":
            obj = _fetch_object()
            content = extract_pptx(
                get_bytes(obj["Body"], compression),
                get_content_index_bytes(bucket_name=bucket))
        else:
            content = get_plain_text(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id
            )

    return content