def test_bad_line_count(self):
    """send known bad line_count parameters and expect a 400 for each"""
    # Each case pairs a bad value with the substring expected in the
    # error 'detail' (the original duplicated this whole flow twice).
    cases = (
        ('-1', 'out of range'),            # parses as int, but negative
        ('123notint', 'invalid literal'),  # does not parse as int at all
    )
    for garbage, expected_detail in cases:
        event = self._make_event(
            {
                'url': self.FILE_URL,
                'input': 'txt',
                'line_count': garbage
            },
            {'origin': MOCK_ORIGIN})
        resp = t4_lambda_preview.lambda_handler(event, None)
        assert resp['statusCode'] == 400, \
            f'Expected 400 on event with line_count of {garbage}'
        body = json.loads(read_body(resp))
        assert 'Unexpected line_count=' in body['title'], \
            'Expected 400 explanation'
        assert expected_detail in body['detail'], 'Expected 400 explanation'
def test_tsv(self):
    """TSV preview: tab-separated input rendered to an HTML table via pandas."""
    data_file = BASE_DIR / 'avengers.tsv'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=data_file.read_bytes(),
        status=200,
    )
    query = {'url': self.FILE_URL, 'input': 'csv', 'sep': '\t'}
    resp = t4_lambda_preview.lambda_handler(self._make_event(query), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, f'preview failed on {data_file}'
    html = payload['html']
    # exactly one table, with exactly one header section
    assert html.count('<table') == 1, 'expected one HTML table'
    assert html.count('</table>') == 1, 'expected one HTML table'
    assert html.count('<thead>') == 1, 'expected one HTML table head'
    assert html.count('</thead>') == 1, 'expected one HTML table head'
    assert html.count('<p>') == html.count('</p>'), 'malformed HTML'
    assert '<td>Nicholas Fury, Jr., Marcus Johnson</td>' in html, \
        'Expected Nick to be an Avenger'
    assert not re.match(r'\d+ rows × \d+ columns', html), \
        'table dimensions should be removed'
    expected_head = (BASE_DIR / 'tsv_html_response_head.txt').read_text(
        encoding='utf-8')
    assert expected_head in html, 'unexpected first columns'
def test_vcf_gz_partial(self):
    """Preview part of a gzipped VCF.

    We _should_ read 4 whole chunks and one partial one; and the preview
    endpoint should truncate to the last whole line.
    """
    gz_file = BASE_DIR / 'example.vcf.gz'
    # fixture must be big enough to force a partial final chunk
    assert os.path.getsize(gz_file) > 128 * 5, 'not testing partial file decode'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=gz_file.read_bytes(),
        status=200,
    )
    event = self._make_event(
        {'url': self.FILE_URL, 'input': 'vcf', 'compression': 'gz'})
    # test partial decode
    resp = t4_lambda_preview.lambda_handler(event, None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, \
        'preview failed on example.vcf.gz, partial decode'
    data = payload['info']['data']
    assert not data['data'], 'partial decode; did not expect any data'
    assert not data['header'], 'partial decode; did not expect a header'
    meta_lines = data['meta']
    assert meta_lines[0] == '##fileformat=VCFv4.0', 'bad first meta line'
    last_meta = meta_lines[-1]
    assert last_meta.startswith('##FILTER=<'), 'bad last meta line'
    assert last_meta.endswith('samples have data">'), 'bad last meta line'
    metadata = payload['info']['metadata']
    assert metadata['variant_count'] == 0, 'expected no variants'
    assert not metadata['variants'], 'expected no variants'
def test_ipynb(self):
    """test sending ipynb bytes"""
    notebook = BASE_DIR / 'nb_1200727.ipynb'
    responses.add(responses.GET, self.FILE_URL, body=notebook.read_bytes(),
                  status=200)
    event = self._make_event({'url': self.FILE_URL, 'input': 'ipynb'})
    resp = t4_lambda_preview.lambda_handler(event, None)
    body = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, 'preview failed on nb_1200727.ipynb'
    body_html = body['html']
    # neither lxml, nor py_w3c.validators.html.validator works to validate
    # these fragments; reasons include base64 encoded images, html entities, etc.
    # so we are going to trust nbconvert and just do some basic sanity checks
    # it is also the case that we (often) need to update nbconvert, and
    # HTML output changes version over version, so checking for exact HTML
    # is fragile
    assert body_html.count('<div') > 0, 'expected divs in ipynb HTML'
    assert body_html.count('<div') == body_html.count('</div>')
    assert body_html.count('<span') > 0, 'expected spans in ipynb HTML'
    assert body_html.count('<span') == body_html.count('</span>')
    # check for some strings we know should be in there
    assert 'SVD of Minute-Market-Data' in body_html, 'missing expected contents'
    assert 'Preprocessing' in body_html, 'missing expected contents'
    # BUG FIX: the expected fragment contains single quotes, so the literal
    # must be double-quoted; the single-quoted form was a SyntaxError
    assert "<pre>['SEE', 'SE', 'SHW', 'SIG'," in body_html, \
        'Cell 3 output seems off'
    assert (
        '<span class="n">batch_size</span><span class="o">=</span><span class="mi">100</span>'
        '<span class="p">') in body_html, 'Last cell output missing'
def test_tsv_as_csv(self):
    """Mislabeled/problematic CSV (actually tab-ish, mixed types) still previews."""
    data_file = BASE_DIR / 'tsv_mixed_types.csv'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=data_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'csv'}), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, f'preview failed on {data_file}'
    html = payload['html']
    # one table, one header section, balanced paragraph tags
    assert html.count('<table') == 1, 'expected one HTML table'
    assert html.count('</table>') == 1, 'expected one HTML table'
    assert html.count('<thead>') == 1, 'expected one HTML table head'
    assert html.count('</thead>') == 1, 'expected one HTML table head'
    assert html.count('<p>') == html.count('</p>'), 'malformed HTML'
    assert '<td>Taiwan Strait, Taiwan (general), Taiwan</td>' in html, \
        'Missing a cell on the Taiwan Strait'
    assert not re.match(r'\d+ rows × \d+ columns', html), \
        'table dimensions should be removed'
    expected_head = (
        BASE_DIR / 'tsv_mixed_types_html_response_head.txt'
    ).read_text(encoding='utf-8')
    assert expected_head in html, 'unexpected first columns'
def test_tsv_quote(self):
    """Quoted TSV from the GLUE NLP dataset previews, with parse warnings."""
    data_file = BASE_DIR / 'dev.tsv'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=data_file.read_bytes(),
        status=200,
    )
    query = {'url': self.FILE_URL, 'input': 'csv', 'sep': '\t'}
    resp = t4_lambda_preview.lambda_handler(self._make_event(query), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, f'preview failed on {data_file}'
    html = payload['html']
    assert "<td>While dioxin levels in the environment were up" in html,\
        "missing expected cell"
    assert "<td>In Soviet times the Beatles ' music \" was cons...</td>" in html,\
        "missing expected cell"
    parse_warnings = payload['info']['warnings']
    assert parse_warnings, f"expected warnings when parsing {data_file}"
    skipped = parse_warnings.count("Skipping line")
    assert skipped == 43, "expected to skip 43 lines"
def test_bad(self):
    """A request missing the required 'input' query parameter yields a 400."""
    event = self._make_event({'url': self.FILE_URL}, {'origin': MOCK_ORIGIN})
    resp = t4_lambda_preview.lambda_handler(event, None)
    status = resp['statusCode']
    assert status == 400, 'Expected 400 on event without "input" query param'
    assert resp['body'], 'Expected explanation for 400'
    # CORS header must be present even on error responses
    assert resp['headers']['access-control-allow-origin'] == '*'
def test_bad_hostname(self):
    """A URL that does not point at S3 is rejected with a 400."""
    non_s3_url = 'https://example.com/foo'
    event = self._make_event(
        {'url': non_s3_url, 'input': 'txt'},
        {'origin': MOCK_ORIGIN},
    )
    resp = t4_lambda_preview.lambda_handler(event, None)
    assert resp['statusCode'] == 400, 'Expected 400 on event with a non-S3 URL'
    payload = json.loads(read_body(resp))
    assert 'S3' in payload['title'], 'Expected 400 explanation'
def test_vcf(self):
    """An uncompressed VCF file previews successfully."""
    vcf_file = BASE_DIR / 'example.vcf'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=vcf_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'vcf'}), None)
    assert resp['statusCode'] == 200, 'preview failed on example.vcf'
    # detailed content checks live in the shared helper
    _check_vcf(read_body(resp))
def test_no_meta_parquet(self):
    """A parquet file lacking meta.metadata still previews with a 200."""
    parquet_file = BASE_DIR / 'no_meta.parquet'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=parquet_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'parquet'}), None)
    status = resp['statusCode']
    assert status == 200, f'Expected 200, got {resp["statusCode"]}'
def test_fcs(self):
    """FCS extraction over every fixture in data/fcs/.

    For extended testing you can download FCS files here
    https://flowrepository.org/experiments/4/download_ziped_files,
    copy to data/fcs/ and run this unit test.
    """
    fcs_dir = BASE_DIR / "fcs"
    fcs_files = list(fcs_dir.glob("*.fcs"))
    baseline_names = {'accuri-ao1.fcs', 'bad.fcs', '3215apc 100004.fcs'}
    # anything beyond the three checked-in fixtures means the extended suite,
    # where per-file name assertions are skipped
    extended = {os.path.split(f)[1] for f in fcs_files} != baseline_names
    for index, fcs in enumerate(fcs_files):
        _, name = os.path.split(fcs)
        file_bytes = fcs.read_bytes()
        # `responses` registers a URL once; swap the body on later iterations
        register = responses.add if index == 0 else responses.replace
        register(
            responses.GET,
            self.FILE_URL,
            body=file_bytes,
            status=200,
        )
        event = self._make_event({'url': self.FILE_URL, 'input': 'fcs'})
        resp = t4_lambda_preview.lambda_handler(event, None)
        assert resp['statusCode'] == 200, \
            f'Expected 200, got {resp["statusCode"]}'
        body = json.loads(read_body(resp))
        assert 'info' in body
        if 'warnings' not in body['info']:
            # clean parse: HTML table plus metadata
            if not extended:
                assert name == 'accuri-ao1.fcs'
            assert body['html'].startswith('<div>')
            assert body['html'].endswith('</div>')
            assert body['info']['metadata'].keys()
        else:
            # parse problems: no HTML at all
            assert not body['html']
            if 'metadata' not in body['info']:
                assert body['info']['warnings'].startswith('Unable')
                if not extended:
                    assert name == 'bad.fcs'
            else:
                if not extended:
                    assert name == '3215apc 100004.fcs'
def test_403(self):
    """An upstream 403 (e.g. Glacier-archived object) is surfaced as 403."""
    target = self.FILE_URL
    responses.add(responses.GET, url=target, status=403)
    event = self._make_event({'url': target, 'input': 'txt'})
    resp = t4_lambda_preview.lambda_handler(event, None)
    assert resp["statusCode"] == 403
    payload = json.loads(resp["body"])
    # error payload carries both a human-readable text and an error field
    assert "text" in payload
    assert "error" in payload
def test_txt_max_count(self, get_preview_lines):
    """line_count is parsed and forwarded verbatim to get_preview_lines."""
    responses.add(responses.GET, self.FILE_URL, body='foo', status=200)
    for line_count in (1, 44, 19):
        get_preview_lines.reset_mock()
        get_preview_lines.return_value = []
        query = {
            'url': self.FILE_URL,
            'input': 'txt',
            'line_count': str(line_count),
        }
        resp = t4_lambda_preview.lambda_handler(self._make_event(query), None)
        assert resp['statusCode'] == 200, 'preview lambda failed'
        # the int-converted count must reach the helper unchanged
        get_preview_lines.assert_called_with(
            ANY, None, line_count, t4_lambda_preview.CATALOG_LIMIT_BYTES)
def test_bad_max_bytes(self):
    """send a known bad max_bytes parameter"""
    garbage = 'gfgfgf'
    event = self._make_event(
        {
            'url': self.FILE_URL,
            'input': 'txt',
            'max_bytes': garbage
        },
        {'origin': MOCK_ORIGIN})
    resp = t4_lambda_preview.lambda_handler(event, None)
    # BUG FIX: the failure message previously said "line_count" although
    # this test exercises the max_bytes parameter
    assert resp['statusCode'] == 400, \
        f'Expected 400 on event with max_bytes of {garbage}'
    body = json.loads(read_body(resp))
    assert 'Unexpected max_bytes=' in body['title'], 'Expected 400 explanation'
def test_parquet_empty(self):
    """A parquet file with columns but zero rows still renders its headers."""
    parquet_file = BASE_DIR / 'onlycolumns-c000'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=parquet_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'parquet'}), None)
    assert resp['statusCode'] == 200, f'Expected 200, got {resp["statusCode"]}'
    payload = json.loads(read_body(resp))
    html = payload['html']
    # spot-check column headers from the start, middle, and end
    assert '<th>column_a</th>' in html, 'Missing column_a'
    assert '<th>column_k</th>' in html, 'Missing column_k'
    assert '<th>column_z</th>' in html, 'Missing column_z'
def test_excel(self):
    """Excel workbooks fetched from S3 are parsed and previewed."""
    xlsx_file = BASE_DIR / 'sample.xlsx'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=xlsx_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'excel'}), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, 'preview failed on sample.xlsx'
    html = payload['html']
    # known value frequencies from the fixture workbook
    expected_counts = {
        'Germany': 13,
        'Enterprise': 7,
        'Midmarket': 13,
        'Canada': 9,
    }
    for needle, count in expected_counts.items():
        assert html.count(needle) == count, 'unexpected data contents'
def test_ipynb_chop(self):
    """Output cells are dropped when the response nears Lambda's size limit."""
    nb_file = BASE_DIR / 'nb_1200727.ipynb'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=nb_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'ipynb'}), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, 'preview failed on nb_1200727.ipynb'
    html_size = len(payload['html'])
    # isclose bc string sizes differ, e.g. on Linux
    assert math.isclose(html_size, 18084, abs_tol=200), \
        "Hmm, didn't chop nb_1200727.ipynb"
def test_txt_short(self):
    """A short text file is returned whole in 'head' with an empty 'tail'."""
    txt_file = BASE_DIR / 'short.txt'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=txt_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'txt'}), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, 'preview lambda failed on short.txt'
    head = payload['info']['data']['head']
    assert len(head) == 98, 'unexpected number of lines head'
    assert head[0] == 'Line 1', 'unexpected first line in head'
    assert head[97] == 'Line 98', 'unexpected last line in head'
    tail = payload['info']['data']['tail']
    assert not tail, 'expected empty tail'
def test_max_bytes(self):
    """max_bytes truncates the preview mid-line ('Line 1' -> 'Line')."""
    txt_file = BASE_DIR / 'short.txt'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=txt_file.read_bytes(),
        status=200,
    )
    query = {'url': self.FILE_URL, 'input': 'txt', 'max_bytes': '3'}
    resp = t4_lambda_preview.lambda_handler(self._make_event(query), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, 'preview lambda failed on short.txt'
    head = payload['info']['data']['head']
    assert len(head) == 1, 'unexpected number of lines head'
    assert head[0] == 'Line', 'unexpected first line in head'
def test_parquet_no_pandas(self):
    """Parquet written without pandas metadata still previews correctly."""
    parquet_file = BASE_DIR / 'parquet_no_pandas.snappy.parquet'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=parquet_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'parquet'}), None)
    assert resp['statusCode'] == 200, f'Expected 200, got {resp["statusCode"]}'
    payload = json.loads(read_body(resp))
    html = payload['html']
    # compare the response against the file's own parquet schema
    schema_names = pq.ParquetFile(parquet_file).schema.names
    missing = [col for col in schema_names if f'<th>{col}</th>' not in html]
    assert not missing, 'missing a column header in the preview'
    assert html.count('<') > 0, 'expected tags in HTML'
    assert html.count('<') == html.count('>'), \
        'unmatched HTML tags'
    assert set(schema_names) == set(payload['info']['schema']['names']), \
        'unexpected difference of columns'
def test_csv(self):
    """CSV preview renders a single HTML table without dimension text."""
    data_file = BASE_DIR / 'sample.csv'
    responses.add(
        responses.GET,
        self.FILE_URL,
        body=data_file.read_bytes(),
        status=200,
    )
    resp = t4_lambda_preview.lambda_handler(
        self._make_event({'url': self.FILE_URL, 'input': 'csv'}), None)
    payload = json.loads(read_body(resp))
    assert resp['statusCode'] == 200, 'preview failed on sample.csv'
    html = payload['html']
    assert html.count('<table') == 1, 'expected one HTML table'
    assert html.count('</table>') == 1, 'expected one HTML table'
    assert html.count('<p>') == html.count('</p>'), 'malformed HTML'
    assert not re.match(r'\d+ rows × \d+ columns', html), \
        'table dimensions should be removed'
    expected_head = (BASE_DIR / 'csv_html_response_head.txt').read_text(
        encoding='utf-8')
    assert expected_head in html, 'unexpected first columns'