Пример #1
0
    def test_bad_line_count(self):
        """Send known bad line_count parameters and expect a 400 for each.

        Every case issues a 'txt' preview request carrying the bad value and
        checks that the error title names the parameter and that the detail
        contains the expected explanation fragment.
        """
        # (bad line_count value, fragment expected in the error detail)
        cases = (
            ('-1', 'out of range'),
            ('123notint', 'invalid literal'),
        )
        for garbage, expected_detail in cases:
            params = {
                'url': self.FILE_URL,
                'input': 'txt',
                'line_count': garbage,
            }
            event = self._make_event(params, {'origin': MOCK_ORIGIN})
            resp = t4_lambda_preview.lambda_handler(event, None)
            status = resp['statusCode']
            assert status == 400, \
                f'Expected 400 on event with line_count of {garbage}'
            body = json.loads(read_body(resp))
            title = body['title']
            assert 'Unexpected line_count=' in title, 'Expected 400 explanation'
            assert expected_detail in body['detail'], 'Expected 400 explanation'
Пример #2
0
 def test_tsv(self):
     """test returning HTML previews of TSV (via pandas)

     Serves avengers.tsv at the mocked file URL, requests a 'csv' preview
     with a tab separator, and checks the structure and contents of the
     returned HTML.
     """
     csv = BASE_DIR / 'avengers.tsv'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=csv.read_bytes(),
                   status=200)
     event = self._make_event({
         'url': self.FILE_URL,
         'input': 'csv',
         'sep': '\t'
     })
     resp = t4_lambda_preview.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, f'preview failed on {csv}'
     body_html = body['html']
     assert body_html.count('<table') == 1, 'expected one HTML table'
     assert body_html.count('</table>') == 1, 'expected one HTML table'
     assert body_html.count('<thead>') == 1, 'expected one HTML table head'
     assert body_html.count('</thead>') == 1, 'expected one HTML table head'
     assert body_html.count('<p>') == body_html.count(
         '</p>'), 'malformed HTML'
     assert '<td>Nicholas Fury, Jr., Marcus Johnson</td>' in body_html, \
         'Expected Nick to be an Avenger'
     # re.search, not re.match: match only tests the very start of the
     # string, so it could never see a dimensions footer embedded later in
     # the HTML and the assertion would pass vacuously
     assert not re.search(r'\d+ rows × \d+ columns', body_html), \
         'table dimensions should be removed'
     head = (BASE_DIR / 'tsv_html_response_head.txt').read_text(
         encoding='utf-8')
     assert head in body_html, 'unexpected first columns'
Пример #3
0
 def test_vcf_gz_partial(self):
     """test previewing part of a gzipped file

     we _should_ read 4 whole chunks and one partial one;
     and the preview endpoint should truncate to the last whole line
     """
     vcf = BASE_DIR / 'example.vcf.gz'
     # the fixture must be bigger than five 128-byte chunks, otherwise the
     # whole file would be decoded and nothing partial would be exercised
     size = os.path.getsize(vcf)
     assert size > 128 * 5, 'not testing partial file decode'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=vcf.read_bytes(),
         status=200,
     )
     params = {
         'url': self.FILE_URL,
         'input': 'vcf',
         'compression': 'gz',
     }
     event = self._make_event(params)
     # test partial decode
     resp = t4_lambda_preview.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     status = resp['statusCode']
     assert status == 200, 'preview failed on example.vcf.gz, partial decode'
     info = body['info']
     data = info['data']
     assert not data['data'], 'partial decode; did not expect any data'
     assert not data['header'], 'partial decode; did not expect a header'
     meta_lines = data['meta']
     assert meta_lines[0] == '##fileformat=VCFv4.0', 'bad first meta line'
     last_meta = meta_lines[-1]
     assert last_meta.startswith('##FILTER=<'), 'bad last meta line'
     assert last_meta.endswith('samples have data">'), 'bad last meta line'
     metadata = info['metadata']
     assert metadata['variant_count'] == 0, 'expected no variants'
     assert not metadata['variants'], 'expected no variants'
Пример #4
0
    def test_ipynb(self):
        """Send notebook bytes and sanity-check the HTML that comes back."""
        notebook = BASE_DIR / 'nb_1200727.ipynb'
        responses.add(
            responses.GET,
            self.FILE_URL,
            body=notebook.read_bytes(),
            status=200,
        )
        params = {'url': self.FILE_URL, 'input': 'ipynb'}
        resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
        body = json.loads(read_body(resp))
        assert resp['statusCode'] == 200, 'preview failed on nb_1200727.ipynb'
        body_html = body['html']

        # neither lxml, nor py_w3c.validators.html.validator works to validate
        # these fragments; reasons include base64 encoded images, html entities, etc.
        # so we are going to trust nbconvert and just do some basic sanity checks
        # it is also the case that we (often) need to update nbconvert, and
        # HTML output changes version over version, so checking for exact HTML
        # is fragile
        for tag in ('div', 'span'):
            opened = body_html.count(f'<{tag}')
            closed = body_html.count(f'</{tag}>')
            assert opened > 0, f'expected {tag}s in ipynb HTML'
            assert opened == closed
        # check for some strings we know should be in there
        for known_text in ('SVD of Minute-Market-Data', 'Preprocessing'):
            assert known_text in body_html, 'missing expected contents'
        cell_3_output = (
            '<pre>[&#39;SEE&#39;, &#39;SE&#39;, &#39;SHW&#39;, &#39;SIG&#39;,')
        assert cell_3_output in body_html, 'Cell 3 output seems off'
        last_cell = (
            '<span class="n">batch_size</span><span class="o">=</span><span class="mi">100</span>'
            '<span class="p">')
        assert last_cell in body_html, 'Last cell output missing'
0
 def test_tsv_as_csv(self):
     """test returning HTML previews of mislabeled or problematic CSVs (via pandas)

     Serves tsv_mixed_types.csv at the mocked file URL, requests a plain
     'csv' preview (no separator hint), and checks the returned HTML.
     """
     csv = BASE_DIR / 'tsv_mixed_types.csv'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=csv.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'csv'})
     resp = t4_lambda_preview.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, f'preview failed on {csv}'
     body_html = body['html']
     assert body_html.count('<table') == 1, 'expected one HTML table'
     assert body_html.count('</table>') == 1, 'expected one HTML table'
     assert body_html.count('<thead>') == 1, 'expected one HTML table head'
     assert body_html.count('</thead>') == 1, 'expected one HTML table head'
     assert body_html.count('<p>') == body_html.count(
         '</p>'), 'malformed HTML'
     assert '<td>Taiwan Strait, Taiwan (general), Taiwan</td>' in body_html, \
         'Missing a cell on the Taiwan Strait'
     # re.search, not re.match: match only tests the very start of the
     # string, so it could never see a dimensions footer embedded later in
     # the HTML and the assertion would pass vacuously
     assert not re.search(r'\d+ rows × \d+ columns', body_html), \
         'table dimensions should be removed'
     head = (BASE_DIR / 'tsv_mixed_types_html_response_head.txt').read_text(
         encoding='utf-8')
     assert head in body_html, 'unexpected first columns'
Пример #6
0
    def test_tsv_quote(self):
        """Preview a TSV from the glue NLP dataset and expect skip warnings."""
        csv = BASE_DIR / 'dev.tsv'
        responses.add(
            responses.GET,
            self.FILE_URL,
            body=csv.read_bytes(),
            status=200,
        )
        params = {
            'url': self.FILE_URL,
            'input': 'csv',
            'sep': '\t',
        }
        resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
        body = json.loads(read_body(resp))
        assert resp['statusCode'] == 200, f'preview failed on {csv}'

        body_html = body['html']
        expected_cells = (
            "<td>While dioxin levels in the environment were up",
            "<td>In Soviet times the Beatles ' music \" was cons...</td>",
        )
        for cell in expected_cells:
            assert cell in body_html, "missing expected cell"

        warnings = body['info']['warnings']
        assert warnings, f"expected warnings when parsing {csv}"
        skipped = warnings.count("Skipping line")
        assert skipped == 43, "expected to skip 43 lines"
Пример #7
0
 def test_bad(self):
     """Send an event lacking the "input" query parameter; expect a 400."""
     params = {'url': self.FILE_URL}
     headers = {'origin': MOCK_ORIGIN}
     resp = t4_lambda_preview.lambda_handler(
         self._make_event(params, headers), None)
     status = resp['statusCode']
     assert status == 400, 'Expected 400 on event without "input" query param'
     assert resp['body'], 'Expected explanation for 400'
     # the error response must still carry the CORS header
     allow_origin = resp['headers']['access-control-allow-origin']
     assert allow_origin == '*'
Пример #8
0
 def test_bad_hostname(self):
     """Send a URL that is not an S3 URL; expect a 400 that mentions S3."""
     bad_url = 'https://example.com/foo'
     params = {'url': bad_url, 'input': 'txt'}
     event = self._make_event(params, {'origin': MOCK_ORIGIN})
     resp = t4_lambda_preview.lambda_handler(event, None)
     status = resp['statusCode']
     assert status == 400, 'Expected 400 on event with a non-S3 URL'
     title = json.loads(read_body(resp))['title']
     assert 'S3' in title, 'Expected 400 explanation'
Пример #9
0
 def test_vcf(self):
     """Send uncompressed VCF bytes and validate the preview body."""
     vcf = BASE_DIR / 'example.vcf'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=vcf.read_bytes(),
         status=200,
     )
     params = {'url': self.FILE_URL, 'input': 'vcf'}
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     assert resp['statusCode'] == 200, 'preview failed on example.vcf'
     # detailed content checks are delegated to the _check_vcf helper
     _check_vcf(read_body(resp))
Пример #10
0
    def test_no_meta_parquet(self):
        """Preview a parquet file with no meta.metadata; expect a 200."""
        no_meta_parquet = BASE_DIR / 'no_meta.parquet'
        responses.add(
            responses.GET,
            self.FILE_URL,
            body=no_meta_parquet.read_bytes(),
            status=200,
        )
        params = {'url': self.FILE_URL, 'input': 'parquet'}
        resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)

        status = resp['statusCode']
        assert status == 200, f'Expected 200, got {resp["statusCode"]}'
Пример #11
0
    def test_fcs(self):
        """Preview every FCS file under the fcs/ data directory.

        for extended testing you can download FCS files here
        https://flowrepository.org/experiments/4/download_ziped_files,
        copy to data/fcs/ and run this unit test

        When the directory holds exactly the three bundled files, each
        response shape is additionally tied to the specific file expected to
        produce it; with extra files present ("extended") only the response
        shapes are checked.
        """
        parent = BASE_DIR / "fcs"
        fcs_files = list(parent.glob("*.fcs"))
        bundled = {'accuri-ao1.fcs', 'bad.fcs', '3215apc 100004.fcs'}
        extended = {f.name for f in fcs_files} != bundled
        for index, fcs in enumerate(fcs_files):
            name = fcs.name
            file_bytes = fcs.read_bytes()
            # the first file registers the mock; later files swap its body
            register = responses.add if index == 0 else responses.replace
            register(
                responses.GET,
                self.FILE_URL,
                body=file_bytes,
                status=200,
            )

            params = {'url': self.FILE_URL, 'input': 'fcs'}
            resp = t4_lambda_preview.lambda_handler(
                self._make_event(params), None)
            assert resp['statusCode'] == 200, \
                f'Expected 200, got {resp["statusCode"]}'
            body = json.loads(read_body(resp))
            assert 'info' in body
            info = body['info']
            if 'warnings' in info:
                # any warning means no HTML preview was produced
                assert not body['html']
                if 'metadata' in info:
                    assert extended or name == '3215apc 100004.fcs'
                else:
                    assert info['warnings'].startswith('Unable')
                    assert extended or name == 'bad.fcs'
            else:
                assert extended or name == 'accuri-ao1.fcs'
                assert body['html'].startswith('<div>')
                assert body['html'].endswith('</div>')
                assert info['metadata'].keys()
Пример #12
0
 def test_403(self):
     """Upstream 403 responses (e.g. Glacier) are passed through as 403."""
     url = self.FILE_URL
     responses.add(responses.GET, url=url, status=403)
     params = {'url': url, 'input': 'txt'}
     response = t4_lambda_preview.lambda_handler(
         self._make_event(params), None)
     assert response["statusCode"] == 403
     body = json.loads(response["body"])
     # the error payload must expose both of these keys
     for key in ("text", "error"):
         assert key in body
Пример #13
0
 def test_txt_max_count(self, get_preview_lines):
     """Check that line_count is forwarded to get_preview_lines as an int.

     get_preview_lines is used as a mock here: it is reset and given an
     empty return value before each request, then its last call is checked.
     """
     responses.add(responses.GET, self.FILE_URL, body='foo', status=200)
     for count in (1, 44, 19):
         get_preview_lines.reset_mock()
         get_preview_lines.return_value = []
         params = {
             'url': self.FILE_URL,
             'input': 'txt',
             'line_count': str(count),
         }
         resp = t4_lambda_preview.lambda_handler(
             self._make_event(params), None)
         assert resp['statusCode'] == 200, 'preview lambda failed'
         byte_limit = t4_lambda_preview.CATALOG_LIMIT_BYTES
         get_preview_lines.assert_called_with(ANY, None, count, byte_limit)
Пример #14
0
 def test_bad_max_bytes(self):
     """send a known bad max_bytes parameter

     A non-numeric max_bytes must be rejected with a 400 whose title names
     the offending parameter.
     """
     garbage = 'gfgfgf'
     event = self._make_event(
         {
             'url': self.FILE_URL,
             'input': 'txt',
             'max_bytes': garbage
         }, {'origin': MOCK_ORIGIN})
     resp = t4_lambda_preview.lambda_handler(event, None)
     # failure message names max_bytes (it previously said line_count, a
     # copy-paste slip that would mislead anyone reading a failing run)
     assert resp[
         'statusCode'] == 400, f'Expected 400 on event with max_bytes of {garbage}'
     body = json.loads(read_body(resp))
     assert 'Unexpected max_bytes=' in body[
         'title'], 'Expected 400 explanation'
Пример #15
0
 def test_parquet_empty(self):
     """Preview a parquet file with columns but no rows; headers must render."""
     parquet = BASE_DIR / 'onlycolumns-c000'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=parquet.read_bytes(),
         status=200,
     )
     params = {'url': self.FILE_URL, 'input': 'parquet'}
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     status = resp['statusCode']
     assert status == 200, f'Expected 200, got {resp["statusCode"]}'
     body = json.loads(read_body(resp))
     # spot-check column headers by name
     for suffix in ('a', 'k', 'z'):
         header = f'<th>column_{suffix}</th>'
         assert header in body['html'], f'Missing column_{suffix}'
Пример #16
0
 def test_excel(self):
     """Preview an Excel workbook and count known values in the HTML."""
     workbook = BASE_DIR / 'sample.xlsx'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=workbook.read_bytes(),
         status=200,
     )
     params = {'url': self.FILE_URL, 'input': 'excel'}
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview failed on sample.xlsx'
     body_html = body['html']
     # word -> number of times it must occur in the rendered preview
     expected_counts = {
         'Germany': 13,
         'Enterprise': 7,
         'Midmarket': 13,
         'Canada': 9,
     }
     for word, occurrences in expected_counts.items():
         assert body_html.count(word) == occurrences, \
             'unexpected data contents'
Пример #17
0
 def test_ipynb_chop(self):
     """test that we eliminate output cells when we're in danger of breaking
     Lambda's invocation limit

     The check is on the size of the returned HTML, which must be close to
     a known length.
     """
     notebook = BASE_DIR / 'nb_1200727.ipynb'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=notebook.read_bytes(),
         status=200,
     )
     params = {'url': self.FILE_URL, 'input': 'ipynb'}
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview failed on nb_1200727.ipynb'
     html_length = len(body['html'])
     # isclose bc string sizes differ, e.g. on Linux
     size_ok = math.isclose(html_length, 18084, abs_tol=200)
     assert size_ok, "Hmm, didn't chop nb_1200727.ipynb"
Пример #18
0
 def test_txt_short(self):
     """Preview a short text file: all lines land in head, tail is empty."""
     txt = BASE_DIR / 'short.txt'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=txt.read_bytes(),
         status=200,
     )
     params = {'url': self.FILE_URL, 'input': 'txt'}
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview lambda failed on short.txt'
     data = body['info']['data']
     headlist = data['head']
     assert len(headlist) == 98, 'unexpected number of lines head'
     assert headlist[0] == 'Line 1', 'unexpected first line in head'
     assert headlist[97] == 'Line 98', 'unexpected last line in head'
     taillist = data['tail']
     assert not taillist, 'expected empty tail'
Пример #19
0
 def test_max_bytes(self):
     """Request a txt preview with max_bytes=3 and expect a truncated head."""
     txt = BASE_DIR / 'short.txt'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=txt.read_bytes(),
         status=200,
     )
     params = {
         'url': self.FILE_URL,
         'input': 'txt',
         'max_bytes': '3',
     }
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview lambda failed on short.txt'
     headlist = body['info']['data']['head']
     line_total = len(headlist)
     assert line_total == 1, 'unexpected number of lines head'
     first_line = headlist[0]
     assert first_line == 'Line', 'unexpected first line in head'
Пример #20
0
 def test_parquet_no_pandas(self):
     """Preview parquet bytes that use a different metadata format.

     The response is compared against the column names read directly from
     the same file with pq.ParquetFile.
     """
     parquet = BASE_DIR / 'parquet_no_pandas.snappy.parquet'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=parquet.read_bytes(),
         status=200,
     )
     params = {'url': self.FILE_URL, 'input': 'parquet'}
     resp = t4_lambda_preview.lambda_handler(self._make_event(params), None)
     status = resp['statusCode']
     assert status == 200, f'Expected 200, got {resp["statusCode"]}'
     body = json.loads(read_body(resp))
     # open file and check body return against parquet metadata
     pf = pq.ParquetFile(parquet)
     html = body['html']
     has_every_header = all(
         f'<th>{col}</th>' in html for col in pf.schema.names)
     assert has_every_header, 'missing a column header in the preview'
     assert html.count('<') > 0, 'expected tags in HTML'
     assert html.count('<') == html.count('>'), \
         'unmatched HTML tags'
     reported_names = body['info']['schema']['names']
     assert set(pf.schema.names) == set(reported_names), \
         'unexpected difference of columns'
Пример #21
0
 def test_csv(self):
     """test returning HTML previews of CSV (via pandas)

     Serves sample.csv at the mocked file URL, requests a 'csv' preview,
     and checks the structure and leading contents of the returned HTML.
     """
     csv = BASE_DIR / 'sample.csv'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=csv.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'csv'})
     resp = t4_lambda_preview.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview failed on sample.csv'
     body_html = body['html']
     assert body_html.count('<table') == 1, 'expected one HTML table'
     assert body_html.count('</table>') == 1, 'expected one HTML table'
     assert body_html.count('<p>') == body_html.count(
         '</p>'), 'malformed HTML'
     # re.search, not re.match: match only tests the very start of the
     # string, so it could never see a dimensions footer embedded later in
     # the HTML and the assertion would pass vacuously
     assert not re.search(r'\d+ rows × \d+ columns', body_html), \
         'table dimensions should be removed'
     head = (BASE_DIR / 'csv_html_response_head.txt').read_text(
         encoding='utf-8')
     assert head in body_html, 'unexpected first columns'