예제 #1
0
 def test_reset_index(self):
     # Sanitizing should always leave row numbers contiguous from zero.
     table = pd.DataFrame([[1, 'a'], [2, 'b'], [3, 'c']])
     # Drop the middle row so the remaining index labels are [0, 2], i.e.
     # genuinely non-contiguous. (Slicing with iloc[0:0] yields an empty
     # frame, which dropped the *first* row instead of the middle one.)
     newtab = pd.concat([table.iloc[0:1], table.iloc[2:]])
     sanitize_dataframe(newtab)
     # Compare as an ordered list: assertCountEqual would also accept [1, 0].
     self.assertEqual(list(newtab.index), [0, 1])
예제 #2
0
    def test_categories_to_string_allows_abnormal_index(self):
        class Obj:
            # Non-str category value; str() returns the wrapped text.
            def __init__(self, value):
                self.value = value

            def __str__(self):
                return self.value

        # Slicing a DataFrame slices its Series: the category list remains
        # complete, even though some categories aren't used. In this example,
        # `table['A']` has an Obj('a') category, even though the value doesn't
        # appear anywhere in the dataframe. (This is because slicing creates a
        # numpy "view", not a copy of the original array of codes.)
        #
        # Sanitize's output shouldn't include any categories that aren't
        # visible. (The data in memory should not be a "view".)
        table = pd.DataFrame({'A': [Obj('a'), Obj('b'), 'c', 'b']},
                             dtype='category')[1:]
        sanitize_dataframe(table)
        expected = pd.DataFrame({'A': ['b', 'c', 'b']}, dtype='category')
        assert_frame_equal(table, expected)
        # Inspect the *sanitized* table's categories. Checking `expected`
        # here would only test the fixture built a few lines above and could
        # never fail because of sanitize_dataframe().
        self.assertEqual(
            sorted(table['A'].cat.categories.tolist()),
            ['b', 'c']
        )
예제 #3
0
    def test_load_json(self):
        # Fetching a JSON URL should render the sanitized table on success
        # and put the module in the error state when the body is not JSON.
        url = 'http://test.com/the.json'
        self.url_pval.set_value(url)
        self.url_pval.save()

        # use a complex example with nested data
        fname = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
        # Read through a context manager so the fixture's file handle is
        # closed as soon as the text has been read.
        with open(fname) as f:
            sfpd_json = f.read()
        sfpd_table = pd.DataFrame(
            json.loads(sfpd_json, object_pairs_hook=OrderedDict)
        )  # OrderedDict otherwise cols get sorted
        sanitize_dataframe(sfpd_table)

        # success case
        with requests_mock.Mocker() as m:
            m.get(url,
                  text=sfpd_json,
                  headers={'content-type': 'application/json'})
            self.press_fetch_button()
            response = self.get_render()
            self.assertEqual(response.content.decode('utf-8'),
                             make_render_json(sfpd_table))

        # malformed json should put module in error state
        with requests_mock.Mocker() as m:
            m.get(url,
                  text="there's just no way this is json",
                  headers={'content-type': 'application/json'})
            self.press_fetch_button()
            self.wfmodule.refresh_from_db()
            self.assertEqual(self.wfmodule.status, WfModule.ERROR)
예제 #4
0
 def test_nonstr_colnames(self):
     # Story #157901159: the "first row is header" option can produce an int
     # column name, but Workbench requires every column name to be a str.
     rows = [['a', 'b'], ['c', 'd']]
     table = pd.DataFrame(data=rows, columns=['A', 3])
     sanitize_dataframe(table)
     # The int label 3 is expected to come out as the string '3'.
     assert_frame_equal(
         table,
         pd.DataFrame({'A': ['a', 'c'], '3': ['b', 'd']})
     )
예제 #5
0
 def test_rename_colnames_while_converting_types(self):
     # Replacing a column must never happen while duplicate names exist, so
     # column renaming has to run before value conversion. This table needs
     # both: an int column label and a dict cell.
     rows = [['a', {'a': 'b'}], ['c', 'd']]
     table = pd.DataFrame(data=rows, columns=['A', 3])
     sanitize_dataframe(table)
     assert_frame_equal(
         table,
         pd.DataFrame({'A': ['a', 'c'], '3': ["{'a': 'b'}", 'd']})
     )
예제 #6
0
 def test_remove_unused_categories(self):
     # 'c' is declared as a category but no cell uses it (extraneous value).
     declared = pd.api.types.CategoricalDtype(['a', 'b', 'c'])
     table = pd.DataFrame({'A': ['a', 'b']}, dtype=declared)
     sanitize_dataframe(table)
     # A plain 'category' frame only carries the categories actually present.
     assert_frame_equal(
         table,
         pd.DataFrame({'A': ['a', 'b']}, dtype='category')
     )
예제 #7
0
    def test_mixed_to_string_allows_custom_types(self):
        # Cells holding arbitrary Python objects should be replaced by their
        # str() representation.
        class Obj:
            def __str__(self):
                return 'x'

        cells = [Obj(), Obj()]
        frame = pd.DataFrame({'A': cells})
        sanitize_dataframe(frame)
        assert_frame_equal(frame, pd.DataFrame({'A': ['x', 'x']}))
예제 #8
0
    def event(wfm, event=None, **kwargs):
        """Fetch the page at the 'url' param and store one of its HTML tables.

        Reads the 'url' and 1-based 'tablenum' parameters from ``wfm``,
        parses the page with ``pd.read_html`` and passes the selected table,
        after sanitizing it, to ``save_fetched_table_if_changed``. Every
        user-facing failure is reported through ``wfm.set_error`` followed by
        an early return.

        Raises:
            HTTPError: re-raised unchanged for any HTTP status other than 404.
        """
        url = wfm.get_param_string('url').strip()
        tablenum = wfm.get_param_integer('tablenum') - 1  # 1 based for user

        if tablenum < 0:
            wfm.set_error(_('Table number must be at least 1'))
            return

        validate = URLValidator()
        try:
            validate(url)
        except ValidationError:
            # Double-quoted so the apostrophe survives: 'doesn''t' is implicit
            # concatenation of two literals and evaluates to "doesnt".
            wfm.set_error(_("That doesn't seem to be a valid URL"))
            return

        # fetching could take a while so notify clients/users that we're working on it
        wfm.set_busy()

        try:
            tables = pd.read_html(url, flavor='html5lib')
        except ValueError:
            wfm.set_error(_('No tables found on this page'))
            return
        except HTTPError as e:  # catch this first as it's a subclass of URLError
            if e.code == 404:
                wfm.set_error(_('Page not found (404)'))
                return
            raise  # bare raise keeps the original traceback
        except URLError:
            wfm.set_error(_('Server not found'))   # bad domain, probably
            return

        # Single empty-result check: an earlier duplicate inside the try block
        # set a different message that this one immediately overwrote.
        numtables = len(tables)
        if numtables == 0:
            wfm.set_error(_('There are no HTML <table> tags on this page'))
            return

        if tablenum >= numtables:
            if numtables == 1:
                wfm.set_error(_('There is only one HTML <table> tag on this page'))
            else:
                wfm.set_error(_('There are only %d HTML <table> tags on this page') % numtables)
            return

        table = tables[tablenum]

        sanitize_dataframe(table)  # ensure all columns are simple types (e.g. nested json to strings)

        # Also notifies client
        save_fetched_table_if_changed(wfm, table, '')
예제 #9
0
 def test_mixed_to_string_keeps_nan(self):
     # Sanitizing a mixed-type column that has a missing value must leave
     # that cell empty instead of turning it into the text 'nan'.
     # https://www.pivotaltracker.com/story/show/154619564
     table = pd.DataFrame({'A': [1.0, 'str', np.nan, '']})  # mixed
     sanitize_dataframe(table)
     expected = pd.DataFrame({'A': ['1.0', 'str', np.nan, '']})
     assert_frame_equal(table, expected)
예제 #10
0
def test_render(in_table, patch_json, out_table=None, out_error=''):
    """Render EditCells on ``in_table`` and compare with the expected result.

    Args:
        in_table: input DataFrame; sanitized in place before rendering.
        patch_json: value passed to ``MockParams`` as ``celledits``.
        out_table: expected output DataFrame. ``None`` (the default) means a
            fresh empty DataFrame.
        out_error: expected error message.
    """
    # Build the default per call. A DataFrame created in the signature is
    # shared by every call, and expected.sanitize_in_place() below mutates it
    # in place.
    if out_table is None:
        out_table = pd.DataFrame()

    sanitize_dataframe(in_table)

    result = EditCells.render(MockParams(celledits=patch_json), in_table)
    result = ProcessResult.coerce(result)
    result.sanitize_in_place()

    expected = ProcessResult(out_table, out_error)
    expected.sanitize_in_place()

    assert result.error == expected.error
    assert_frame_equal(result.dataframe, expected.dataframe)
예제 #11
0
 def test_duplicate_colnames_rename_conflict(self):
     # The second 'A' column is a duplicate, and the name 'A_1' is already
     # taken by another column, so the renamed column is expected to end up
     # as 'A_1_1'.
     rows = [[1, 2, 3], [2, 3, 4], [3, 4, 5]]
     table = pd.DataFrame(data=rows, columns=['A', 'A_1', 'A'])
     sanitize_dataframe(table)
     # Same cell values, only the last column label differs.
     assert_frame_equal(
         table,
         pd.DataFrame(data=rows, columns=['A', 'A_1', 'A_1_1'])
     )
예제 #12
0
def test_render(in_table, patch_json, out_table=None,
                out_error=''):
    """Render EditCells on ``in_table`` and compare with the expected result.

    Args:
        in_table: input DataFrame; sanitized in place before rendering.
        patch_json: value handed to ``MockWfModule``.
        out_table: expected output DataFrame. ``None`` (the default) means a
            fresh empty DataFrame.
        out_error: expected error message.
    """
    # Build the default per call. A DataFrame created in the signature is
    # shared by every call, and expected.sanitize_in_place() below mutates it
    # in place.
    if out_table is None:
        out_table = pd.DataFrame()

    wfm = MockWfModule(patch_json)
    sanitize_dataframe(in_table)

    result = ProcessResult.coerce(EditCells.render(wfm, in_table))
    result.sanitize_in_place()

    expected = ProcessResult(out_table, out_error)
    expected.sanitize_in_place()

    assert result.error == expected.error
    assert_frame_equal(result.dataframe, expected.dataframe)
예제 #13
0
    def test_categories_to_string_allows_custom_category_types(self):
        # Categories that are arbitrary objects should be replaced by their
        # str() text; here Obj('a') and the plain string 'a' both read 'a'.
        class Obj:
            def __init__(self, value):
                self.value = value

            def __str__(self):
                return self.value

        raw_values = [Obj('a'), Obj('b'), Obj('a'), 'a', 'y']
        table = pd.DataFrame({'A': raw_values}, dtype='category')
        sanitize_dataframe(table)
        assert_frame_equal(
            table,
            pd.DataFrame({'A': ['a', 'b', 'a', 'a', 'y']}, dtype='category')
        )
예제 #14
0
    def setUp(self):
        # Fixture: an 'uploadfile' module, a request factory, and the CSV and
        # XLSX mock files parsed into DataFrames.
        super(UploadFileViewTests, self).setUp()  # log in
        self.wfm = load_and_add_module('uploadfile')
        self.factory = APIRequestFactory()

        xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'

        # Path through chardet encoding detection
        with open(mock_csv_path, 'rb') as csv_bytes:
            csv_result = parse_bytesio(csv_bytes, 'text/csv', None)
            self.csv_table = csv_result.dataframe

        with open(mock_xlsx_path, 'rb') as xlsx_bytes:
            xlsx_result = parse_bytesio(xlsx_bytes, xlsx_mime, None)
            self.xlsx_table = xlsx_result.dataframe
            # Only the XLSX table is sanitized here; the CSV table is kept as
            # parsed.
            sanitize_dataframe(self.xlsx_table)
예제 #15
0
    def test_store_some_random_table(self):
        # Round-trip a realistic table (many value types, including dict-type
        # columns that the mock data did not exercise) through StoredObject.
        json_path = os.path.join(settings.BASE_DIR,
                                 'server/tests/test_data/sfpd.json')
        with open(json_path) as json_file:
            records = json.load(json_file)
        self.test_table = pd.DataFrame(records)
        sanitize_dataframe(self.test_table)

        stored = StoredObject.create_table(self.wfm1, self.test_table,
                                           self.metadata)
        self.assertEqual(stored.metadata, self.metadata)
        # What comes back out must equal what went in.
        reloaded = stored.get_table()
        self.assertTrue(reloaded.equals(self.test_table))
예제 #16
0
    def test_sanitize_dataframe(self):
        # Load a test table which has a dict column
        fname = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
        # Context manager closes the fixture file; json.load(open(...)) left
        # the handle open.
        with open(fname) as f:
            sfpd_dict = json.load(f)
        sfpd = pd.DataFrame(sfpd_dict)

        # sfpd_types is indexed by column name, so the 7th column is selected
        # with .iloc (explicit positional access) rather than a bare [6].
        sfpd_types = sfpd.apply(pd.api.types.infer_dtype, skipna=True)
        self.assertEqual(sfpd.columns[6], 'location')
        self.assertEqual(sfpd_types.iloc[6], 'mixed')

        sanitize_dataframe(sfpd)

        # should have converted mixed types (and other complex types) to string
        sfpd_types = sfpd.apply(pd.api.types.infer_dtype, skipna=True)
        self.assertEqual(sfpd_types.iloc[6], 'string')
예제 #17
0
    def setUp(self):
        # Fixture: a test workflow with two modules, some raw bytes and
        # metadata, and a sanitized table loaded from sfpd.json.
        super().setUp()

        self.workflow = create_testdata_workflow()
        self.wfm1 = WfModule.objects.first()
        self.wfm2 = add_new_wf_module(self.workflow,
                                      ModuleVersion.objects.first(),
                                      1)  # order = 1
        self.test_data = 'stored data'.encode()
        self.metadata = 'metadataish'

        # Use a more realistic test table with lots of data of different types
        # mock data wasn't finding bugs related to dict-type columns
        fname = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
        # Context manager closes the fixture file; json.load(open(...)) left
        # the handle open on every setUp.
        with open(fname) as f:
            sfpd = json.load(f)
        self.test_table = pd.DataFrame(sfpd)
        sanitize_dataframe(self.test_table)
예제 #18
0
    def setUp(self):
        # Fixture: a mocked OAuth service, a 'googlesheets' module with
        # credential and file-selection params, and a sanitized CSV table.
        super().setUp()

        # Set up auth: every lookup of an OAuth service returns a mock whose
        # requests object answers GET with a 404 placeholder response.
        placeholder_response = MockResponse(404, 'Test not written')
        self.requests = Mock()
        self.requests.get = Mock(return_value=placeholder_response)
        self.oauth_service = Mock()
        self.oauth_service.requests_or_str_error = Mock(
            return_value=self.requests)
        self.oauth_service_lookup_patch = patch.object(
            oauth.OAuthService,
            'lookup_or_none',
            return_value=self.oauth_service)
        self.oauth_service_lookup_patch.start()

        # Create WfModule
        self.wf_module = load_and_add_module('googlesheets')

        credentials = {
            'name': 'file',
            'secret': {
                'refresh_token': 'a-refresh-token'
            },
        }
        self.credentials_param = get_param_by_id_name('google_credentials')
        self.credentials_param.value = json.dumps(credentials)
        self.credentials_param.save()

        selected_file = {
            "id": "aushwyhtbndh7365YHALsdfsdf987IBHJB98uc9uisdj",
            "name": "Police Data",
            "url": "http://example.org/police-data",
            "mimeType": "application/vnd.google-apps.spreadsheet",
        }
        self.file_param = get_param_by_id_name('googlefileselect')
        self.file_param.value = json.dumps(selected_file)
        self.file_param.save()

        # our test data
        csv_stream = io.BytesIO(example_csv)
        self.test_table = pd.read_csv(csv_stream, encoding='utf-8')
        sanitize_dataframe(self.test_table)
    def test_sanitize_dataframe(self):
        # Load a test table which has a dict column
        fname = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
        # Context manager closes the fixture file; json.load(open(...)) left
        # the handle open.
        with open(fname) as f:
            sfpd_dict = json.load(f)
        sfpd = pd.DataFrame(sfpd_dict)

        # sfpd_types is indexed by column name, so the 7th column is selected
        # with .iloc (explicit positional access) rather than a bare [6].
        sfpd_types = sfpd.apply(pd.api.types.infer_dtype)
        self.assertEqual(sfpd.columns[6], 'location')
        self.assertEqual(sfpd_types.iloc[6], 'mixed')

        sanitize_dataframe(sfpd)

        # should have converted mixed types (and other complex types) to string
        sfpd_types = sfpd.apply(pd.api.types.infer_dtype)
        self.assertEqual(sfpd_types.iloc[6], 'string')

        # used by StoredObject, will crash on complex types, which we should
        # not have
        hash_pandas_object(sfpd)
예제 #20
0
    def setUp(self):
        # Fixture: a patched OAuth service configuration, a 'googlesheets'
        # module with credential and file-selection params, and a sanitized
        # table read from gdrive_file.
        super().setUp()

        # Set up auth
        google_service = {
            'token_url': 'http://token-url',
            'refresh_url': 'http://refresh-url',
            'client_id': 'client-id',
            'client_secret': 'client-secret',
            'redirect_url': 'http://my-redirect-server',
        }
        self.service_patch = patch.dict(
            settings.PARAMETER_OAUTH_SERVICES,
            {'google_credentials': google_service})
        self.service_patch.start()

        # Create WfModule
        self.wf_module = load_and_add_module('googlesheets')

        credentials = {
            'name': 'file',
            'secret': {
                'refresh_token': 'a-refresh-token'
            },
        }
        self.credentials_param = get_param_by_id_name('google_credentials')
        self.credentials_param.value = json.dumps(credentials)
        self.credentials_param.save()

        selected_file = {
            "id": "aushwyhtbndh7365YHALsdfsdf987IBHJB98uc9uisdj",
            "name": "Police Data",
            "url": "http://example.org/police-data",
        }
        self.file_param = get_param_by_id_name('googlefileselect')
        self.file_param.value = json.dumps(selected_file)
        self.file_param.save()

        # our test data
        self.test_table = pd.read_csv(gdrive_file)
        sanitize_dataframe(self.test_table)
예제 #21
0
def upload_to_table(wf_module, uploaded_file):
    """Parse an uploaded file and store it as the module's fetched table.

    On a parse failure the error is set on ``wf_module`` and the uploaded
    file is deleted. Otherwise the table is truncated if too big, sanitized,
    stored as a new data version, and the new StoredObject's metadata is set
    to the JSON the client expects (filename and uuid).

    Args:
        wf_module: module receiving the table; must provide ``set_error``,
            ``set_ready`` and ``store_fetched_table``.
        uploaded_file: upload record providing ``uuid``, ``name`` and
            ``delete()``.
    """
    try:
        table = __parse_uploaded_file(uploaded_file)
    except Exception as e:
        wf_module.set_error(str(e), notify=True)
        # delete uploaded file, we probably can't ever use it
        uploaded_file.delete()
        return

    # Cut this file down to size to prevent reading in the hugest data on every render
    nrows = len(table)
    if truncate_table_if_too_big(table):
        # Interpolate AFTER the gettext lookup: formatting inside _() makes
        # the catalog lookup use the already-filled-in text, which can never
        # match a translated message id.
        error = _('File has %d rows, truncated to %d') % (
            nrows, settings.MAX_ROWS_PER_TABLE)
        wf_module.set_error(error, notify=False)
    else:
        # start of file upload sets module busy status on client side; undo this.
        wf_module.set_ready(notify=False)

    sanitize_dataframe(table)

    # Save the new output, creating and switching to a new data version
    version_added = wf_module.store_fetched_table(table)

    # set new StoredObject metadata to the json response the client expects, containing filename and uuid
    # (see views.UploadedFile.get)
    new_so = StoredObject.objects.get(wf_module=wf_module,
                                      stored_at=version_added)
    result = [{'uuid': uploaded_file.uuid, 'name': uploaded_file.name}]
    new_so.metadata = json.dumps(result)
    new_so.save()

    ChangeDataVersionCommand.create(wf_module,
                                    version_added)  # also notifies client

    # don't delete UploadedFile, so that we can reparse later or allow higher
    # row limit or download original, etc.
예제 #22
0
    def event(wfmodule, **kwargs):
        # Fetch the selected Google Sheet as CSV and save it (or the error)
        # as the module's fetched table. Does nothing when no file is
        # selected or the selection has an empty id.
        raw_meta = wfmodule.get_param_raw('googlefileselect', 'custom')
        if not raw_meta:
            return

        # Only the 'id' is used. The 'url' entry is for the client's web
        # browser, not for an API request.
        sheet_id = json.loads(raw_meta)['id']
        if not sheet_id:
            return

        secret = wfmodule.get_param_secret_secret('google_credentials')
        csv_text, error = get_spreadsheet(sheet_id, secret)

        # Start from an empty table; it is replaced only by a successful parse.
        table = pd.DataFrame()
        if not error:
            try:
                table = pd.read_csv(io.StringIO(csv_text))
                error = ''
            except CParserError as parse_error:
                error = str(parse_error)

        sanitize_dataframe(table)
        save_fetched_table_if_changed(wfmodule, table, error)
예제 #23
0
 def test_lists_and_dicts(self):
     # List and dict cells are expected to come out as their str() text.
     cells = [[5, 6, 7], {'a': 'b'}]
     table = pd.DataFrame({'A': cells})
     sanitize_dataframe(table)
     assert_frame_equal(
         table,
         pd.DataFrame({'A': ['[5, 6, 7]', "{'a': 'b'}"]})
     )
예제 #24
0
    def event(wfm, event=None, **kwargs):
        """Download the file at the 'url' param and store it as a table.

        Supports CSV, Excel and JSON responses, chosen by the response's
        Content-Type (plus 'application/octet-stream' when the URL contains
        '.xls'). Failures are reported through ``wfm.set_error``. On success
        the table is sanitized and passed to ``save_fetched_table_if_changed``.

        ``event`` and ``kwargs`` are accepted for interface compatibility and
        are not used.
        """
        url = wfm.get_param_string('url').strip()

        validate = URLValidator()
        try:
            validate(url)
        except ValidationError:
            # Double-quoted so the apostrophe survives: 'doesn''t' is implicit
            # concatenation of two literals and evaluates to "doesnt".
            wfm.set_error("That doesn't seem to be a valid URL")
            return

        # fetching could take a while so notify clients/users that we're working on it
        wfm.set_busy()

        excel_types = ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
        csv_types = ['text/csv']
        json_types = ['application/json']
        mimetypes = ','.join(excel_types + csv_types + json_types)

        try:
            res = requests.get(url, headers={'accept': mimetypes})
        except requests.exceptions.ConnectionError:
            wfm.set_error('Could not connect to server')
            return

        if res.status_code != requests.codes.ok:
            wfm.set_error('Error %s fetching url' % str(res.status_code))
            return

        # get content type, ignoring charset for now. A response may omit the
        # header entirely; default to '' so it is reported as an unknown
        # content type below instead of crashing on None.split().
        content_type = res.headers.get('content-type', '').split(';')[0].strip()

        # Excel arrives either with a proper Excel MIME type or as a generic
        # binary download whose URL mentions '.xls'.
        is_excel = content_type in excel_types or (
            content_type == "application/octet-stream" and '.xls' in url)

        if content_type in csv_types:
            try:
                table = pd.read_csv(io.StringIO(res.text))
            except CParserError as e:
                wfm.set_error(str(e))
                # NOTE(review): execution deliberately continues to the status
                # check below; presumably set_error() puts wfm in the ERROR
                # status so this fallback table is not saved - confirm.
                table = pd.DataFrame([{'result': res.text}])

        elif is_excel:
            try:
                table = pd.read_excel(io.BytesIO(res.content))
            except XLRDError as e:
                wfm.set_error(str(e))
                return

        elif content_type in json_types:
            try:
                # OrderedDict otherwise cols get sorted
                table = pd.DataFrame(
                    json.loads(res.text, object_pairs_hook=OrderedDict))
            except ValueError as e:
                wfm.set_error(str(e))
                return

        else:
            wfm.set_error('Error fetching %s: unknown content type %s' % (url, content_type))
            return

        if wfm.status != wfm.ERROR:
            wfm.set_ready(notify=False)

            # NOTE(review): an "auto update or user click" flag used to be
            # computed here but was never consulted, so the table is saved
            # unconditionally, as before.
            sanitize_dataframe(table)  # ensure all columns are simple types (e.g. nested json to strings)

            # Also notifies client
            save_fetched_table_if_changed(wfm, table, '')
예제 #25
0
 def sanitize_in_place(self):
     """Sanitize this result's dataframe in place.

     Delegates to ``sanitizedataframe.sanitize_dataframe`` so that headers
     become strings and values become simple types.
     """
     frame = self.dataframe
     sanitizedataframe.sanitize_dataframe(frame)
예제 #26
0
 def test_cast_int_category_to_int(self):
     # A category column whose categories are all ints is expected to come
     # out as a plain (non-categorical) int column.
     table = pd.DataFrame({'A': [1, 2]}, dtype='category')
     sanitize_dataframe(table)
     assert_frame_equal(table, pd.DataFrame({'A': [1, 2]}))
예제 #27
0
 def test_cast_mixed_category_to_str(self):
     # A category column mixing int and str categories is expected to stay
     # categorical, with every category converted to its string form.
     table = pd.DataFrame({'A': [1, '2']}, dtype='category')
     sanitize_dataframe(table)
     assert_frame_equal(
         table,
         pd.DataFrame({'A': ['1', '2']}, dtype='category')
     )