def test_headers_are_trimmed_csv(self):
    """
    Uploading a CSV whose column headers carry leading/trailing whitespace
    should still succeed in strict mode: the parser trims the headers
    before comparing them against the dataset schema.
    """
    expected_headers = ['What', 'When', 'Who']
    dataset = self._create_dataset_from_rows([expected_headers])
    self.assertEqual(dataset.schema.headers, expected_headers)

    # Upload a record whose headers are padded with spaces.
    rows = [
        ['What ', ' When', ' Who '],
        ['Something', '2018-02-01', 'me'],
    ]
    csv_path = helpers.rows_to_csv_file(rows)
    url = reverse('api:dataset-upload', kwargs={'pk': dataset.pk})
    with open(csv_path) as fp:
        payload = {
            'file': fp,
            'strict': True  # strict mode must still accept trimmed headers
        }
        resp = self.custodian_1_client.post(url, data=payload, format='multipart')
        self.assertEqual(status.HTTP_200_OK, resp.status_code)

    # The stored record is keyed by the trimmed header names.
    record = dataset.record_queryset.first()
    self.assertEqual(record.data.get('What'), 'Something')
    self.assertEqual(record.data.get('When'), '2018-02-01')
    self.assertEqual(record.data.get('Who'), 'me')
    # The padded (untrimmed) header names must not appear as keys.
    for padded_header in rows[0]:
        self.assertIsNone(record.data.get(padded_header))
def test_site_code_column_name(self):
    """
    A column named 'site_code' can be used to extract the site code when
    uploading sites.
    """
    header = [
        'site_code', 'Site Name', 'Description', 'Latitude', 'Longitude',
        'Datum', 'Attribute1', 'Attribute2'
    ]
    rows = [
        header,
        ['C1', 'Site 1', 'Description1', -32, 116, '', 'attr11', 'attr12'],
        ['C2', 'Site 2', 'Description2', -31, 117, '', 'attr21', 'attr22'],
    ]
    csv_path = helpers.rows_to_csv_file(rows)
    project = self.project_1
    url = reverse('api:upload-sites', kwargs={'pk': project.pk})

    # Sanity check: the project starts with no sites.
    self.assertEqual(0, Site.objects.filter(project=project).count())

    with open(csv_path) as fp:
        resp = self.custodian_1_client.post(
            url, data={'file': fp}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, resp.status_code)

    # One site per data row, codes taken from the 'site_code' column.
    sites = Site.objects.filter(project=project)
    self.assertEqual(len(rows) - 1, sites.count())
    self.assertEqual(['C1', 'C2'], [s.code for s in sites.order_by('code')])
def test_permissions(self):
    """
    Site upload is allowed only for custodians and admins; anonymous,
    read-only and non-custodian users get 401/403.
    :return:
    """
    project = self.project_1
    url = reverse('api:upload-sites', kwargs={'pk': project.pk})
    bogus_payload = {'file': 'dsddsds'}

    forbidden_clients = [
        self.anonymous_client,
        self.readonly_client,
        self.custodian_2_client,
    ]
    for client in forbidden_clients:
        resp = client.post(url, data=bogus_payload, format='multipart')
        self.assertIn(
            resp.status_code,
            [status.HTTP_401_UNAUTHORIZED, status.HTTP_403_FORBIDDEN])

    # Allowed clients get a valid file so the upload itself succeeds.
    csv_path = helpers.rows_to_csv_file([['Site Code'], ['C1']])
    allowed_clients = [self.admin_client, self.custodian_1_client]
    for client in allowed_clients:
        with open(csv_path) as fp:
            resp = client.post(url, data={'file': fp}, format='multipart')
            self.assertIn(resp.status_code, [status.HTTP_200_OK])
def test_generic_string_and_number_simple_csv(self):
    """
    The infer endpoint detects number and integer column types (falling
    back to string) from a simple CSV.
    """
    header = ['Name', 'Age', 'Weight', 'Comments']
    rows = [
        header,
        ['Frederic', '56', '80.5', 'a comment'],
        ['Hilda', '24', '56', ''],
    ]
    csv_path = helpers.rows_to_csv_file(rows)
    with open(csv_path, 'rb') as fp:
        resp = self.data_engineer_1_client.post(
            self.url, data={'file': fp}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, resp.status_code)
        # Response must be JSON.
        self.assertEqual(resp.get('content-type'), 'application/json')
        received = resp.json()

        # 'name' is derived from the uploaded file name, extension stripped.
        self.assertIn('name', received)
        expected_name = path.splitext(path.basename(fp.name))[0]
        self.assertEqual(expected_name, received.get('name'))

        # Inferred dataset type should be 'generic'.
        self.assertIn('type', received)
        self.assertEqual('generic', received.get('type'))

        # data_package verification
        self.assertIn('data_package', received)
        self.verify_inferred_data(received)

        # Verify the inferred schema: column order and per-column types.
        schema_descriptor = Package(
            received.get('data_package')).resources[0].descriptor['schema']
        schema = utils_data_package.GenericSchema(schema_descriptor)
        self.assertEqual(len(schema.fields), len(header))
        self.assertEqual(schema.field_names, header)
        expected_types = {
            'Name': 'string',
            'Age': 'integer',
            'Weight': 'number',
            'Comments': 'string',
        }
        for column, expected_type in expected_types.items():
            field = schema.get_field_by_name(column)
            self.assertEqual(field.type, expected_type)
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'default')
def test_upload_csv_happy_path(self):
    """
    A well-formed CSV uploads successfully in strict mode. Records are
    stored in row order and each record's source_info tracks the origin
    file name and the 1-based CSV row number.
    """
    rows = [
        ['Column A', 'Column B'],
        ['A1', 'B1'],
        ['A2', 'B2'],
    ]
    csv_path = helpers.rows_to_csv_file(rows)
    file_name = path.basename(csv_path)

    self.assertEqual(0, self.ds.record_queryset.count())
    with open(csv_path) as fp:
        payload = {
            'file': fp,
            'strict': True  # upload in strict mode
        }
        resp = self.custodian_1_client.post(
            self.url, data=payload, format='multipart')
        self.assertEqual(status.HTTP_200_OK, resp.status_code)

    # Records are saved in the same order as the CSV data rows.
    saved_records = self.ds.record_queryset.order_by('pk')
    self.assertEqual(len(rows) - 1, saved_records.count())
    for index, (value_a, value_b) in enumerate(rows[1:]):
        record = saved_records[index]
        self.assertEqual(
            {'Column A': value_a, 'Column B': value_b}, record.data)
        # source_info holds the file name and the CSV row number:
        # the header is row 1, so the first data row is row 2.
        self.assertIsNotNone(record.source_info)
        self.assertEqual(
            record.source_info,
            {'file_name': file_name, 'row': index + 2})

    self.assertEqual(self.project_1.record_count, len(rows) - 1)
    self.assertEqual(self.ds.record_count, len(rows) - 1)
def test_upload_csv_happy_path(self):
    """
    A well-formed site CSV creates one Site per data row: geometry is
    built from Latitude/Longitude and the non-mapped columns are stored
    as string attributes.
    """
    header = [
        'Site Code', 'Site Name', 'Description', 'Latitude', 'Longitude',
        'Datum', 'Attribute1', 'Attribute2'
    ]
    rows = [
        header,
        ['C1', 'Site 1', 'Description1', -32, 116, '', 'attr11', 'attr12'],
        ['C2', 'Site 2', 'Description2', -31, 117, '', 'attr21', 'attr22'],
    ]
    csv_path = helpers.rows_to_csv_file(rows)
    project = self.project_1
    url = reverse('api:upload-sites', kwargs={'pk': project.pk})

    # Sanity check: the project starts with no sites.
    self.assertEqual(0, Site.objects.filter(project=project).count())

    with open(csv_path) as fp:
        resp = self.custodian_1_client.post(
            url, data={'file': fp}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, resp.status_code)

    sites = Site.objects.filter(project=project)
    self.assertEqual(len(rows) - 1, sites.count())
    self.assertEqual(['C1', 'C2'], [s.code for s in sites.order_by('code')])
    self.assertEqual(['Site 1', 'Site 2'],
                     [s.name for s in sites.order_by('name')])
    self.assertEqual(['Description1', 'Description2'],
                     [s.description for s in sites.order_by('description')])

    # Geometry is (x=longitude, y=latitude); every other column is kept
    # verbatim (as strings) in the attributes dict.
    site = sites.filter(code='C1').first()
    self.assertEqual((116, -32), (site.geometry.x, site.geometry.y))
    self.assertEqual({
        'Latitude': '-32',
        'Longitude': '116',
        'Datum': '',
        'Attribute1': 'attr11',
        'Attribute2': 'attr12',
    }, site.attributes)
    self.assertEqual(project.site_count, len(rows) - 1)
def test_csv_with_excel_content_type(self):
    """
    Often on Windows a csv file comes with an excel content-type
    (e.g: 'application/vnd.ms-excel').
    Test that we handle the case: the infer endpoint must still parse
    the payload as CSV.
    """
    view = InferDatasetView.as_view()
    columns = ['Name', 'Age', 'Weight', 'Comments']
    rows = [
        columns,
        ['Frederic', '56', '80.5', 'a comment'],
        ['Hilda', '24', '56', '']
    ]
    file_ = helpers.rows_to_csv_file(rows)
    factory = APIRequestFactory()
    with open(file_, 'rb') as fp:
        payload = {
            'file': fp,
        }
        # In order to hack the Content-Type of the multipart form data we need
        # to use the APIRequestFactory and work with the view directly.
        # Can't use the classic API client.
        # hack the content-type of the request.
        # NOTE(review): _encode_data is a private DRF API — may break across
        # DRF versions; confirm on upgrade.
        data, content_type = factory._encode_data(payload, format='multipart')
        # On Python 3 the encoded body is bytes; round-trip through str to do
        # the textual replace, then back to bytes.
        if six.PY3:
            data = data.decode('utf-8')
        data = data.replace('Content-Type: text/csv',
                            'Content-Type: application/vnd.ms-excel')
        if six.PY3:
            data = data.encode('utf-8')
        request = factory.generic('POST', self.url, data, content_type=content_type)
        # Authenticate the raw request with the data engineer's token.
        user = self.data_engineer_1_user
        token, _ = Token.objects.get_or_create(user=user)
        force_authenticate(request, user=self.data_engineer_1_user, token=token)
        resp = view(request).render()
        self.assertEqual(status.HTTP_200_OK, resp.status_code)
        # should be json
        self.assertEqual(resp.get('content-type'), 'application/json')
        if six.PY3:
            content = resp.content.decode('utf-8')
        else:
            content = resp.content
        received = json.loads(content)
        # name should be set with the file name (extension stripped)
        self.assertIn('name', received)
        file_name = path.splitext(path.basename(fp.name))[0]
        self.assertEqual(file_name, received.get('name'))
        # type should be 'generic'
        self.assertIn('type', received)
        self.assertEqual('generic', received.get('type'))
        # data_package verification
        self.assertIn('data_package', received)
        self.verify_inferred_data(received)
        # verify schema: inferred column order and per-column types
        schema_descriptor = Package(
            received.get('data_package')).resources[0].descriptor['schema']
        schema = utils_data_package.GenericSchema(schema_descriptor)
        self.assertEqual(len(schema.fields), len(columns))
        self.assertEqual(schema.field_names, columns)
        field = schema.get_field_by_name('Name')
        self.assertEqual(field.type, 'string')
        self.assertFalse(field.required)
        self.assertEqual(field.format, 'default')
        field = schema.get_field_by_name('Age')
        self.assertEqual(field.type, 'integer')
        self.assertFalse(field.required)
        self.assertEqual(field.format, 'default')
        field = schema.get_field_by_name('Weight')
        self.assertEqual(field.type, 'number')
        self.assertFalse(field.required)
        self.assertEqual(field.format, 'default')
        field = schema.get_field_by_name('Comments')
        self.assertEqual(field.type, 'string')
        self.assertFalse(field.required)
        self.assertEqual(field.format, 'default')