def test_reset_index(self):
    """Sanitized tables should have row numbers contiguous from zero."""
    table = pd.DataFrame([[1, 'a'], [2, 'b'], [3, 'c']])
    # Lose the middle row: the index becomes [0, 2], which is
    # non-contiguous. (Slicing iloc[0:0] would select nothing and drop the
    # first row instead.)
    newtab = pd.concat([table.iloc[0:1], table.iloc[2:]])
    sanitize_dataframe(newtab)
    # Compare as an ordered list: assertCountEqual ignores ordering and
    # would also accept an index of [1, 0].
    self.assertEqual(list(newtab.index), [0, 1])
def test_categories_to_string_allows_abnormal_index(self):
    """Sanitizing a sliced categorical column keeps only visible categories."""
    class Obj:
        def __init__(self, value):
            self.value = value

        def __str__(self):
            return self.value

    # Slicing a DataFrame slices its Series: the category list remains
    # complete, even though some categories aren't used. In this example,
    # `table['A']` has an Obj('a') category, even though the value doesn't
    # appear anywhere in the dataframe. (This is because slicing creates a
    # numpy "view", not a copy of the original array of codes.)
    #
    # Sanitize's output shouldn't include any categories that aren't
    # visible. (The data in memory should not be a "view".)
    table = pd.DataFrame({'A': [Obj('a'), Obj('b'), 'c', 'b']},
                         dtype='category')[1:]
    sanitize_dataframe(table)
    expected = pd.DataFrame({'A': ['b', 'c', 'b']}, dtype='category')
    assert_frame_equal(table, expected)
    # Inspect the sanitized table itself: checking `expected` here would
    # only verify the fixture built two lines above.
    self.assertEqual(
        sorted(table['A'].cat.categories.tolist()),
        ['b', 'c']
    )
def test_load_json(self):
    """Fetching JSON renders the sanitized table; bad JSON sets ERROR."""
    url = 'http://test.com/the.json'
    self.url_pval.set_value(url)
    self.url_pval.save()

    # use a complex example with nested data
    fname = os.path.join(settings.BASE_DIR,
                         'server/tests/test_data/sfpd.json')
    # Read through a context manager so the fixture file is closed
    # deterministically instead of whenever the handle is collected.
    with open(fname) as f:
        sfpd_json = f.read()
    # OrderedDict otherwise cols get sorted
    sfpd_table = pd.DataFrame(
        json.loads(sfpd_json, object_pairs_hook=OrderedDict)
    )
    sanitize_dataframe(sfpd_table)

    # success case
    with requests_mock.Mocker() as m:
        m.get(url, text=sfpd_json,
              headers={'content-type': 'application/json'})
        self.press_fetch_button()
        response = self.get_render()
        self.assertEqual(response.content.decode('utf-8'),
                         make_render_json(sfpd_table))

    # malformed json should put module in error state
    with requests_mock.Mocker() as m:
        m.get(url, text="there's just no way this is json",
              headers={'content-type': 'application/json'})
        self.press_fetch_button()
        self.wfmodule.refresh_from_db()
        self.assertEqual(self.wfmodule.status, WfModule.ERROR)
def test_nonstr_colnames(self):
    """Integer column names come out of sanitize as strings."""
    # #157901159: "first row is header" option gives int column name, but
    # Workbench requires str
    rows = [['a', 'b'], ['c', 'd']]
    frame = pd.DataFrame(columns=['A', 3], data=rows)

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['a', 'c'], '3': ['b', 'd']})
    assert_frame_equal(frame, wanted)
def test_rename_colnames_while_converting_types(self):
    """A column that needs both a str name and str values is handled."""
    # when we replace a column, there must not be duplicates. In other
    # words: rename-duplicates must come before replace.
    rows = [['a', {'a': 'b'}], ['c', 'd']]
    frame = pd.DataFrame(columns=['A', 3], data=rows)

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['a', 'c'], '3': ["{'a': 'b'}", 'd']})
    assert_frame_equal(frame, wanted)
def test_remove_unused_categories(self):
    """Categories declared on the dtype but absent from the data are dropped."""
    # 'c' is the extraneous value: declared as a category, never used.
    declared = pd.api.types.CategoricalDtype(['a', 'b', 'c'])
    frame = pd.DataFrame({'A': ['a', 'b']}, dtype=declared)

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['a', 'b']}, dtype='category')
    assert_frame_equal(frame, wanted)
def test_mixed_to_string_allows_custom_types(self):
    """Cells holding arbitrary objects are replaced by their str() form."""
    class Obj:
        def __str__(self):
            return 'x'

    cells = [Obj(), Obj()]
    frame = pd.DataFrame({'A': cells})

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['x', 'x']})
    assert_frame_equal(frame, wanted)
def event(wfm, event=None, **kwargs):
    """Fetch one HTML <table> from the module's URL and store it.

    Reads the 'url' and 'tablenum' parameters from `wfm`. Every failure
    (bad table number, invalid URL, unreachable server, 404, no tables,
    table number out of range) is reported through `wfm.set_error` and
    ends the fetch. HTTP errors other than 404 are re-raised. On success
    the selected table is sanitized and handed to
    `save_fetched_table_if_changed`.

    `event` and `**kwargs` are accepted but not used.
    """
    url = wfm.get_param_string('url').strip()
    tablenum = wfm.get_param_integer('tablenum') - 1  # 1 based for user

    if tablenum < 0:
        wfm.set_error(_('Table number must be at least 1'))
        return

    validate = URLValidator()
    try:
        validate(url)
    except ValidationError:
        # A doubled quote ('doesn''t') is adjacent-literal concatenation
        # in Python and silently drops the apostrophe.
        wfm.set_error(_("That doesn't seem to be a valid URL"))
        return

    # fetching could take a while so notify clients/users that we're
    # working on it
    wfm.set_busy()

    try:
        tables = pd.read_html(url, flavor='html5lib')
    except ValueError:
        wfm.set_error(_('No tables found on this page'))
        return
    except HTTPError as e:  # catch this first as it's a subclass of URLError
        if e.code == 404:
            wfm.set_error(_('Page not found (404)'))
            return
        raise  # bare raise keeps the original traceback
    except URLError:
        wfm.set_error(_('Server not found'))  # bad domain, probably
        return

    numtables = len(tables)
    if numtables == 0:
        wfm.set_error(_('There are no HTML <table> tags on this page'))
        return

    if tablenum >= numtables:
        if numtables == 1:
            wfm.set_error(
                _('There is only one HTML <table> tag on this page'))
        else:
            wfm.set_error(
                _('There are only %d HTML <table> tags on this page')
                % numtables)
        return

    table = tables[tablenum]
    # ensure all columns are simple types (e.g. nested json to strings)
    sanitize_dataframe(table)

    # Also notifies client
    save_fetched_table_if_changed(wfm, table, '')
def test_mixed_to_string_keeps_nan(self):
    """Missing values in a mixed column stay NaN instead of becoming 'nan'."""
    # check that sanitizing a non-string column with missing data produces
    # empty cells, not 'nan' strings
    # https://www.pivotaltracker.com/story/show/154619564
    mixed_cells = [1.0, 'str', np.nan, '']
    frame = pd.DataFrame({'A': mixed_cells})

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['1.0', 'str', np.nan, '']})
    assert_frame_equal(frame, wanted)
def test_render(in_table, patch_json, out_table=None, out_error=''):
    """Render EditCells over `in_table` and compare with the expected result.

    `in_table` is sanitized in place before rendering. `patch_json` is
    passed to the module as its `celledits` parameter. `out_table`
    (default: an empty DataFrame) and `out_error` describe the expected
    output; both sides are sanitized before comparison.
    """
    # Build the default per call: a `pd.DataFrame()` default argument is a
    # single shared object, and it would be sanitized in place below on
    # every invocation.
    if out_table is None:
        out_table = pd.DataFrame()

    sanitize_dataframe(in_table)

    result = EditCells.render(MockParams(celledits=patch_json), in_table)
    result = ProcessResult.coerce(result)
    result.sanitize_in_place()

    expected = ProcessResult(out_table, out_error)
    expected.sanitize_in_place()

    assert result.error == expected.error
    assert_frame_equal(result.dataframe, expected.dataframe)
def test_duplicate_colnames_rename_conflict(self):
    """A renamed duplicate must not collide with an existing column name."""
    # The second 'A' cannot become 'A_1' because that name is already
    # taken; the expected outcome is 'A_1_1'.
    rows = [[1, 2, 3], [2, 3, 4], [3, 4, 5]]
    frame = pd.DataFrame(columns=['A', 'A_1', 'A'], data=rows)

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({
        'A': [1, 2, 3],
        'A_1': [2, 3, 4],
        'A_1_1': [3, 4, 5],
    })
    assert_frame_equal(frame, wanted)
def test_render(in_table, patch_json, out_table=None, out_error=''):
    """Render EditCells over `in_table` and compare with the expected result.

    `in_table` is sanitized in place before rendering. `patch_json` is
    wrapped in a MockWfModule and passed to the module. `out_table`
    (default: an empty DataFrame) and `out_error` describe the expected
    output; both sides are sanitized before comparison.
    """
    # Build the default per call: a `pd.DataFrame()` default argument is a
    # single shared object, and it would be sanitized in place below on
    # every invocation.
    if out_table is None:
        out_table = pd.DataFrame()

    wfm = MockWfModule(patch_json)
    sanitize_dataframe(in_table)

    result = ProcessResult.coerce(EditCells.render(wfm, in_table))
    result.sanitize_in_place()

    expected = ProcessResult(out_table, out_error)
    expected.sanitize_in_place()

    assert result.error == expected.error
    assert_frame_equal(result.dataframe, expected.dataframe)
def test_categories_to_string_allows_custom_category_types(self):
    """Object-valued categories are converted to their str() form."""
    class Obj:
        def __init__(self, value):
            self.value = value

        def __str__(self):
            return self.value

    # Obj('a') and the plain string 'a' are expected to end up as the
    # same value after sanitizing.
    cells = [Obj('a'), Obj('b'), Obj('a'), 'a', 'y']
    frame = pd.DataFrame({'A': cells}, dtype='category')

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['a', 'b', 'a', 'a', 'y']},
                          dtype='category')
    assert_frame_equal(frame, wanted)
def setUp(self):
    """Create the uploadfile module and parse the CSV and XLSX fixtures."""
    super(UploadFileViewTests, self).setUp()  # log in

    self.wfm = load_and_add_module('uploadfile')
    self.factory = APIRequestFactory()

    # Path through chardet encoding detection
    with open(mock_csv_path, 'rb') as csv_file:
        csv_result = parse_bytesio(csv_file, 'text/csv', None)
        self.csv_table = csv_result.dataframe

    xlsx_mime = ('application/vnd.openxmlformats-officedocument'
                 '.spreadsheetml.sheet')
    with open(mock_xlsx_path, 'rb') as xlsx_file:
        xlsx_result = parse_bytesio(xlsx_file, xlsx_mime, None)
        self.xlsx_table = xlsx_result.dataframe

    # Only the XLSX table is sanitized here; the CSV table is kept as parsed.
    sanitize_dataframe(self.xlsx_table)
def test_store_some_random_table(self):
    """A realistic table survives StoredObject storage unchanged."""
    # Use a more realistic test table with lots of data of different types
    # mock data wasn't finding bugs related to dict-type columns
    json_path = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
    with open(json_path) as handle:
        records = json.load(handle)

    self.test_table = pd.DataFrame(records)
    sanitize_dataframe(self.test_table)

    stored = StoredObject.create_table(self.wfm1, self.test_table,
                                       self.metadata)
    self.assertEqual(stored.metadata, self.metadata)

    loaded = stored.get_table()
    self.assertTrue(loaded.equals(self.test_table))
def test_sanitize_dataframe(self):
    """Sanitizing turns the mixed-type 'location' column into strings."""
    # Load a test table which has a dict column
    fname = os.path.join(settings.BASE_DIR,
                         'server/tests/test_data/sfpd.json')
    # Close the fixture file deterministically instead of leaking the
    # handle returned by a bare open().
    with open(fname) as f:
        sfpd_dict = json.load(f)
    sfpd = pd.DataFrame(sfpd_dict)

    sfpd_types = sfpd.apply(pd.api.types.infer_dtype, skipna=True)
    self.assertEqual(sfpd.columns[6], 'location')
    # The Series is indexed by column name, so use .iloc for an explicit
    # positional lookup rather than relying on the integer fallback of [].
    self.assertEqual(sfpd_types.iloc[6], 'mixed')

    sanitize_dataframe(sfpd)

    # should have converted mixed types (and other complex types) to string
    sfpd_types = sfpd.apply(pd.api.types.infer_dtype, skipna=True)
    self.assertEqual(sfpd_types.iloc[6], 'string')
def setUp(self):
    """Create a workflow with two modules and a sanitized sfpd test table."""
    super().setUp()

    self.workflow = create_testdata_workflow()
    self.wfm1 = WfModule.objects.first()
    self.wfm2 = add_new_wf_module(self.workflow,
                                  ModuleVersion.objects.first(),
                                  1)  # order = 1

    self.test_data = 'stored data'.encode()
    self.metadata = 'metadataish'

    # Use a more realistic test table with lots of data of different types
    # mock data wasn't finding bugs related to dict-type columns
    fname = os.path.join(settings.BASE_DIR,
                         'server/tests/test_data/sfpd.json')
    # setUp runs before every test: close the fixture file each time
    # rather than leaking one handle per test.
    with open(fname) as f:
        sfpd = json.load(f)
    self.test_table = pd.DataFrame(sfpd)
    sanitize_dataframe(self.test_table)
def setUp(self):
    """Mock the OAuth service, create a googlesheets module, load test data."""
    super().setUp()

    # Set up auth: every request answers 404 unless a test overrides it.
    default_response = MockResponse(404, 'Test not written')
    self.requests = Mock()
    self.requests.get = Mock(return_value=default_response)

    self.oauth_service = Mock()
    self.oauth_service.requests_or_str_error = Mock(
        return_value=self.requests
    )
    self.oauth_service_lookup_patch = patch.object(
        oauth.OAuthService,
        'lookup_or_none',
        return_value=self.oauth_service,
    )
    self.oauth_service_lookup_patch.start()

    # Create WfModule
    self.wf_module = load_and_add_module('googlesheets')

    credentials = {
        'name': 'file',
        'secret': {'refresh_token': 'a-refresh-token'},
    }
    self.credentials_param = get_param_by_id_name('google_credentials')
    self.credentials_param.value = json.dumps(credentials)
    self.credentials_param.save()

    file_meta = {
        "id": "aushwyhtbndh7365YHALsdfsdf987IBHJB98uc9uisdj",
        "name": "Police Data",
        "url": "http://example.org/police-data",
        "mimeType": "application/vnd.google-apps.spreadsheet",
    }
    self.file_param = get_param_by_id_name('googlefileselect')
    self.file_param.value = json.dumps(file_meta)
    self.file_param.save()

    # our test data
    csv_stream = io.BytesIO(example_csv)
    self.test_table = pd.read_csv(csv_stream, encoding='utf-8')
    sanitize_dataframe(self.test_table)
def test_sanitize_dataframe(self):
    """Sanitizing makes the mixed 'location' column string and hashable."""
    # Load a test table which has a dict column
    fname = os.path.join(settings.BASE_DIR,
                         'server/tests/test_data/sfpd.json')
    # Close the fixture file deterministically instead of leaking the
    # handle returned by a bare open().
    with open(fname) as f:
        sfpd_dict = json.load(f)
    sfpd = pd.DataFrame(sfpd_dict)

    sfpd_types = sfpd.apply(pd.api.types.infer_dtype)
    self.assertEqual(sfpd.columns[6], 'location')
    # The Series is indexed by column name, so use .iloc for an explicit
    # positional lookup rather than relying on the integer fallback of [].
    self.assertEqual(sfpd_types.iloc[6], 'mixed')

    sanitize_dataframe(sfpd)

    # should have converted mixed types (and other complex types) to string
    sfpd_types = sfpd.apply(pd.api.types.infer_dtype)
    self.assertEqual(sfpd_types.iloc[6], 'string')

    # used by StoredObject, will crash on complex types, which we should
    # not have
    hash_pandas_object(sfpd)
def setUp(self):
    """Patch OAuth settings, create a googlesheets module, load test data."""
    super().setUp()

    # Set up auth
    google_service = {
        'token_url': 'http://token-url',
        'refresh_url': 'http://refresh-url',
        'client_id': 'client-id',
        'client_secret': 'client-secret',
        'redirect_url': 'http://my-redirect-server',
    }
    self.service_patch = patch.dict(
        settings.PARAMETER_OAUTH_SERVICES,
        {'google_credentials': google_service},
    )
    self.service_patch.start()

    # Create WfModule
    self.wf_module = load_and_add_module('googlesheets')

    credentials = {
        'name': 'file',
        'secret': {'refresh_token': 'a-refresh-token'},
    }
    self.credentials_param = get_param_by_id_name('google_credentials')
    self.credentials_param.value = json.dumps(credentials)
    self.credentials_param.save()

    file_meta = {
        "id": "aushwyhtbndh7365YHALsdfsdf987IBHJB98uc9uisdj",
        "name": "Police Data",
        "url": "http://example.org/police-data",
    }
    self.file_param = get_param_by_id_name('googlefileselect')
    self.file_param.value = json.dumps(file_meta)
    self.file_param.save()

    # our test data
    self.test_table = pd.read_csv(gdrive_file)
    sanitize_dataframe(self.test_table)
def upload_to_table(wf_module, uploaded_file):
    """Parse an uploaded file and store it as the module's new data version.

    If parsing fails, the error is set on `wf_module`, the uploaded file
    is deleted, and nothing is stored. Otherwise the table is truncated if
    too big, sanitized, stored, tagged with the upload's uuid/name as JSON
    metadata, and made the current version via ChangeDataVersionCommand.
    """
    try:
        table = __parse_uploaded_file(uploaded_file)
    except Exception as e:
        wf_module.set_error(str(e), notify=True)
        # delete uploaded file, we probably can't ever use it
        uploaded_file.delete()
        return

    # Cut this file down to size to prevent reading in the hugest data on
    # every render
    nrows = len(table)
    if truncate_table_if_too_big(table):
        # Interpolate AFTER translating: if the numbers are substituted
        # inside _(), the lookup is done on an already-formatted string
        # that can never match the catalogue's msgid.
        error = (_('File has %d rows, truncated to %d')
                 % (nrows, settings.MAX_ROWS_PER_TABLE))
        wf_module.set_error(error, notify=False)
    else:
        # start of file upload sets module busy status on client side;
        # undo this.
        wf_module.set_ready(notify=False)

    sanitize_dataframe(table)

    # Save the new output, creating and switching to a new data version
    version_added = wf_module.store_fetched_table(table)

    # set new StoredObject metadata to the json response the client
    # expects, containing filename and uuid
    # (see views.UploadedFile.get)
    new_so = StoredObject.objects.get(wf_module=wf_module,
                                      stored_at=version_added)
    result = [{'uuid': uploaded_file.uuid, 'name': uploaded_file.name}]
    new_so.metadata = json.dumps(result)
    new_so.save()

    # also notifies client
    ChangeDataVersionCommand.create(wf_module, version_added)

    # don't delete UploadedFile, so that we can reparse later or allow
    # higher row limit or download original, etc.
def event(wfmodule, **kwargs):
    """Fetch the selected Google Sheet as CSV and store it on the module.

    Does nothing when no file is selected or the selection has no id.
    A fetch or parse failure stores an empty table together with the
    error text.
    """
    raw_meta = wfmodule.get_param_raw('googlefileselect', 'custom')
    if not raw_meta:
        return

    # Ignore file_meta['url']. That's for the client's web browser, not for
    # an API request.
    sheet_id = json.loads(raw_meta)['id']
    if not sheet_id:
        return

    secret = wfmodule.get_param_secret_secret('google_credentials')
    new_data, error = get_spreadsheet(sheet_id, secret)

    # Start from an empty table; it is replaced only by a successful parse.
    table = pd.DataFrame()
    if not error:
        try:
            table = pd.read_csv(io.StringIO(new_data))
            error = ''
        except CParserError as e:
            error = str(e)

    sanitize_dataframe(table)
    save_fetched_table_if_changed(wfmodule, table, error)
def test_lists_and_dicts(self):
    """List and dict cells are replaced by their string representation."""
    nested_cells = [[5, 6, 7], {'a': 'b'}]
    frame = pd.DataFrame({'A': nested_cells})

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['[5, 6, 7]', "{'a': 'b'}"]})
    assert_frame_equal(frame, wanted)
def event(wfm, event=None, **kwargs):
    """Download the module's URL, parse it as CSV/Excel/JSON, and store it.

    Reads the 'url' parameter from `wfm`. An invalid URL, connection
    failure, non-OK status, unknown content type, or unparseable Excel or
    JSON is reported through `wfm.set_error` and ends the fetch. A CSV
    parse error is also reported, but the raw response text is still
    stored as a one-cell table. On success the table is sanitized and
    handed to `save_fetched_table_if_changed`.

    `event` and `**kwargs` are accepted but not used.
    """
    url = wfm.get_param_string('url').strip()

    validate = URLValidator()
    try:
        validate(url)
    except ValidationError:
        # A doubled quote ('doesn''t') is adjacent-literal concatenation
        # in Python and silently drops the apostrophe.
        wfm.set_error("That doesn't seem to be a valid URL")
        return

    # fetching could take a while so notify clients/users that we're
    # working on it
    wfm.set_busy()

    excel_types = [
        'application/vnd.ms-excel',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    ]
    csv_types = ['text/csv']
    json_types = ['application/json']
    mimetypes = ','.join(excel_types + csv_types + json_types)

    try:
        res = requests.get(url, headers={'accept': mimetypes})
    except requests.exceptions.ConnectionError:
        wfm.set_error('Could not connect to server')
        return

    if res.status_code != requests.codes.ok:
        wfm.set_error('Error %s fetching url' % str(res.status_code))
        return

    # get content type, ignoring charset for now. Default to '' so a
    # response without the header is reported as an unknown content type
    # instead of raising AttributeError on None.
    content_type = res.headers.get('content-type', '').split(';')[0].strip()

    # Some servers send spreadsheets as a generic binary stream; fall back
    # to the URL to recognize those.
    is_excel = content_type in excel_types or (
        content_type == "application/octet-stream" and '.xls' in url)

    if content_type in csv_types:
        try:
            table = pd.read_csv(io.StringIO(res.text))
        except CParserError as e:
            wfm.set_error(str(e))
            # Keep the raw text so the user can see what was fetched.
            table = pd.DataFrame([{'result': res.text}])

    elif is_excel:
        try:
            table = pd.read_excel(io.BytesIO(res.content))
        except XLRDError as e:
            wfm.set_error(str(e))
            return

    elif content_type in json_types:
        try:
            # OrderedDict otherwise cols get sorted
            table = pd.DataFrame(
                json.loads(res.text, object_pairs_hook=OrderedDict))
        except ValueError as e:
            wfm.set_error(str(e))
            return

    else:
        wfm.set_error('Error fetching %s: unknown content type %s'
                      % (url, content_type))
        return

    if wfm.status != wfm.ERROR:
        wfm.set_ready(notify=False)

    # ensure all columns are simple types (e.g. nested json to strings)
    sanitize_dataframe(table)

    # Also notifies client
    save_fetched_table_if_changed(wfm, table, '')
def sanitize_in_place(self):
    """Sanitize this result's dataframe in place.

    Headers are coerced to strings and values to simple types.
    """
    frame = self.dataframe
    sanitizedataframe.sanitize_dataframe(frame)
def test_cast_int_category_to_int(self):
    """An all-integer categorical column becomes a plain integer column."""
    frame = pd.DataFrame({'A': [1, 2]}, dtype='category')

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': [1, 2]})
    assert_frame_equal(frame, wanted)
def test_cast_mixed_category_to_str(self):
    """A categorical column mixing int and str ends up with str categories."""
    frame = pd.DataFrame({'A': [1, '2']}, dtype='category')

    sanitize_dataframe(frame)

    wanted = pd.DataFrame({'A': ['1', '2']}, dtype='category')
    assert_frame_equal(frame, wanted)