def test_reindex_complex(self):
    upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_CSV_TYPES_FILENAME)

    self.dataset.import_data(self.user, upload)

    utils.wait()

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    dataset.reindex_data(self.user, typed_columns=[True for c in upload.columns])

    utils.wait()

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['text', 'date', 'integer', 'boolean', 'float', 'time', 'datetime', 'empty_column', ''])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'datetime', 'int', 'bool', 'float', 'datetime', 'datetime', 'NoneType', 'unicode'])
    self.assertEqual([c['indexed'] for c in dataset.column_schema], [True for c in upload.columns])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_unicode_text', 'column_datetime_date', 'column_int_integer', 'column_bool_boolean', 'column_float_float', 'column_datetime_time', 'column_datetime_datetime', 'column_NoneType_empty_column', 'column_unicode_'])
    self.assertEqual(dataset.row_count, 5)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_bool_boolean:true')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_text:"Chicago Tribune"')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_datetime:[1971-01-01T01:01:01Z TO NOW]')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_time:[9999-12-31T04:13:01Z TO *]')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_date:[1971-01-01T00:00:00Z TO NOW]')['response']['numFound'], 1)
def test_reindex(self):
    self.dataset.import_data(self.user, self.upload)

    utils.wait()

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

    utils.wait()

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
    self.assertEqual([c['indexed'] for c in dataset.column_schema], [True, False, True, True])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_int_id', None, 'column_unicode_last_name', 'column_unicode_employer'])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:2')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_last_name:Germuska')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_first_name:Joseph')['response']['numFound'], 0)
def test_import_additional_xlsx_typed_columns(self):
    self.dataset.import_data(self.user, self.upload)

    # Refresh from database
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

    second_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_EXCEL_XLSX_FILENAME)

    # Refresh from database
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.dataset.import_data(self.user, second_upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
    self.assertEqual([c['indexed'] for c in dataset.column_schema], [True, False, True, True])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_int_id', None, 'column_unicode_last_name', 'column_unicode_employer'])
    self.assertEqual(dataset.row_count, 8)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:2')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_last_name:Germuska')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_first_name:Joseph')['response']['numFound'], 0)
def test_delete(self):
    upload = utils.get_test_data_upload(self.user, self.dataset)
    upload_id = upload.id
    path = upload.get_path()
    self.assertEqual(os.path.isfile(path), True)

    solr.delete(settings.SOLR_DATA_CORE, '*:*')

    self.dataset.import_data(self.user, upload)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)

    upload = DataUpload.objects.get(id=upload_id)
    dataset = Dataset.objects.get(id=self.dataset.id)
    self.assertEqual(dataset.initial_upload, upload)
    self.assertEqual(dataset.row_count, 4)

    upload.delete()

    # Ensure dataset still exists
    dataset = Dataset.objects.get(id=self.dataset.id)
    self.assertEqual(dataset.initial_upload, None)
    self.assertEqual(dataset.row_count, 0)

    self.assertEqual(os.path.exists(path), False)

    with self.assertRaises(DataUpload.DoesNotExist):
        DataUpload.objects.get(id=upload_id)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 0)
def test_reindex(self):
    self.dataset.import_data(self.user, self.upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    task = dataset.current_task
    self.assertEqual(task.status, 'SUCCESS')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
    self.assertEqual([c['indexed'] for c in dataset.column_schema], [True, False, True, True])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_int_id', None, 'column_unicode_last_name', 'column_unicode_employer'])
    self.assertEqual([c['min'] for c in dataset.column_schema], [1, None, None, None])
    self.assertEqual([c['max'] for c in dataset.column_schema], [4, None, None, None])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:2')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_last_name:Germuska')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_first_name:Joseph')['response']['numFound'], 0)
def test_reindex_complex(self):
    upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_CSV_TYPES_FILENAME)

    self.dataset.import_data(self.user, upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    dataset.reindex_data(self.user, typed_columns=[True for c in upload.columns])

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    task = dataset.current_task
    self.assertEqual(task.status, 'SUCCESS')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['text', 'date', 'integer', 'boolean', 'float', 'time', 'datetime', 'empty_column', ''])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'date', 'int', 'bool', 'float', 'time', 'datetime', None, 'unicode'])
    self.assertEqual([c['indexed'] for c in dataset.column_schema], [True for c in upload.columns])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_unicode_text', 'column_date_date', 'column_int_integer', 'column_bool_boolean', 'column_float_float', 'column_time_time', 'column_datetime_datetime', None, 'column_unicode_'])
    self.assertEqual([c['min'] for c in dataset.column_schema], [None, u'1920-01-01T00:00:00', 40, None, 1.0, u'9999-12-31T00:00:00', u'1971-01-01T04:14:00', None, None])
    self.assertEqual([c['max'] for c in dataset.column_schema], [None, u'1971-01-01T00:00:00', 164, None, 41800000.01, u'9999-12-31T14:57:13', u'2048-01-01T14:57:00', None, None])
    self.assertEqual(dataset.row_count, 5)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_bool_boolean:true')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_text:"Chicago Tribune"')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_datetime:[1971-01-01T01:01:01Z TO NOW]')['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_time_time:[9999-12-31T04:13:01Z TO *]')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_date_date:[1971-01-01T00:00:00Z TO NOW]')['response']['numFound'], 1)
def test_import_xls(self):
    xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)

    self.dataset.import_data(self.user, xls_upload)

    task = self.dataset.current_task

    self.assertNotEqual(task, None)
    self.assertNotEqual(task.id, None)
    self.assertEqual(task.task_name, 'panda.tasks.import.xls')

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    xls_upload = DataUpload.objects.get(id=xls_upload.id)
    task = TaskStatus.objects.get(id=task.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, None, None, None])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(xls_upload.imported, True)
    self.assertEqual(task.status, 'SUCCESS')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
def test_import_additional_data_same_columns(self):
    self.dataset.import_data(self.user, self.upload)

    xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)

    # Refresh from database
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.dataset.import_data(self.user, xls_upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    upload = DataUpload.objects.get(id=self.upload.id)
    xls_upload = DataUpload.objects.get(id=xls_upload.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, None, None, None])
    self.assertEqual(dataset.row_count, 8)
    self.assertEqual(upload.imported, True)
    self.assertEqual(xls_upload.imported, True)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 2)
def run(self, *args, **kwargs):
    from panda.models import SearchSubscription

    log = logging.getLogger(self.name)
    log.info('Running subscribed searches')

    subscriptions = SearchSubscription.objects.all()

    for sub in subscriptions:
        log.info('Running subscription: %s' % sub)

        since = sub.last_run.replace(microsecond=0, tzinfo=None)
        since = since.isoformat('T')

        sub.last_run = now()
        sub.save()

        solr_query = 'last_modified:[%s TO *] AND (%s)' % (since + 'Z', sub.query)

        if sub.dataset:
            solr_query += ' dataset_slug:%s' % (sub.dataset.slug)
        elif sub.category:
            dataset_slugs = sub.category.datasets.values_list('slug', flat=True)
            solr_query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

        response = solr.query(
            settings.SOLR_DATA_CORE,
            solr_query,
            offset=0,
            limit=0
        )

        count = response['response']['numFound']

        log.info('Found %i new results' % count)

        if count:
            if sub.dataset:
                url = '#dataset/%s/search/%s/%s' % (sub.dataset.slug, sub.query_url, since)
            elif sub.category:
                url = '#search/%s/%s/%s' % (sub.category.slug, sub.query, since)
            else:
                url = '#search/all/%s/%s' % (sub.query, since)

            notify(
                sub.user,
                'subscription_results',
                'info',
                url=url,
                extra_context={
                    'query': sub.query,
                    'query_url': sub.query_url,
                    'category': sub.category,
                    'related_dataset': sub.dataset,
                    'count': count,
                    'since': since
                }
            )

    log.info('Finished running subscribed searches')
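# A minimal sketch (not part of the task above; the helper name and literal
# values below are illustrative assumptions) of the Solr query shape that
# run() builds. Only the format strings are taken from the code above.
def _example_subscription_query(since_iso, query, dataset_slugs=None):
    """Reproduce the filter run() sends to the data core."""
    solr_query = 'last_modified:[%s TO *] AND (%s)' % (since_iso + 'Z', query)

    if dataset_slugs:
        solr_query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

    return solr_query

# _example_subscription_query('2012-01-01T00:00:00', 'boston', ['crime-2012'])
# returns: 'last_modified:[2012-01-01T00:00:00Z TO *] AND (boston) dataset_slug:(crime-2012)'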
def test_import_excel_xlsx(self):
    xlsx_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_EXCEL_XLSX_FILENAME)

    self.dataset.import_data(self.user, xlsx_upload)

    task = self.dataset.current_task

    self.assertNotEqual(task, None)
    self.assertNotEqual(task.id, None)
    self.assertEqual(task.task_name, 'panda.tasks.import.xlsx')

    utils.wait()

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    xlsx_upload = DataUpload.objects.get(id=xlsx_upload.id)
    task = TaskStatus.objects.get(id=task.id)

    self.assertEqual(dataset.columns, ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(xlsx_upload.imported, True)
    self.assertEqual(task.status, 'SUCCESS')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
def test_import_data(self):
    response = self.client.get('/api/1.0/dataset/%s/import/%i/' % (self.dataset.slug, self.upload.id), **self.auth_headers)

    self.assertEqual(response.status_code, 200)

    body = json.loads(response.content)

    self.assertNotEqual(body['current_task'], None)
    self.assertEqual(body['current_task']['task_name'], 'panda.tasks.import.csv')

    # Refetch dataset so that attributes will be updated
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual(self.dataset.row_count, 4)
    self.assertEqual([c['name'] for c in self.dataset.column_schema], self.upload.columns)
    self.assertEqual(self.dataset.initial_upload, self.upload)
    self.assertEqual(self.dataset.sample_data, self.upload.sample_data)

    task = self.dataset.current_task

    self.assertNotEqual(task, None)
    self.assertEqual(task.status, 'SUCCESS')
    self.assertEqual(task.task_name, 'panda.tasks.import.csv')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
def _count_rows(self):
    """
    Count the number of rows currently stored in Solr for this Dataset.
    Useful for sanity checks.
    """
    return solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.slug)['response']['numFound']
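# Hedged usage sketch (the helper below is hypothetical, not part of the
# model): _count_rows() makes it easy to assert that the denormalized
# row_count matches what Solr actually holds for this dataset's slug.
def _assert_row_count_consistent(dataset):
    """Raise if the cached row_count has drifted from Solr."""
    assert dataset._count_rows() == (dataset.row_count or 0)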
def test_import_csv_with_schema_overrides(self):
    overrides = {
        'id': {'indexed': True, 'type': 'float'},
        'last_name': {'indexed': True},
    }

    self.dataset.import_data(self.user, self.upload, schema_overrides=overrides)

    task = self.dataset.current_task

    self.assertNotEqual(task, None)
    self.assertNotEqual(task.id, None)
    self.assertEqual(task.task_name, 'panda.tasks.import.csv')

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    upload = DataUpload.objects.get(id=self.upload.id)
    task = TaskStatus.objects.get(id=task.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual(upload.guessed_types, ['int', 'unicode', 'unicode', 'unicode'])
    # NOTE: Without the overrides, the type of "id" would be "int" (per
    # guessed_types) and all indexed_names would be None.
    self.assertEqual([c['type'] for c in dataset.column_schema], ['float', 'unicode', 'unicode', 'unicode'])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_float_id', None, 'column_unicode_last_name', None])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(upload.imported, True)
    self.assertEqual(task.status, 'SUCCESS')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
def test_delete(self):
    self.dataset.import_data(self.user, self.upload)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)

    dataset_id = self.dataset.id

    self.dataset.delete()

    with self.assertRaises(Dataset.DoesNotExist):
        Dataset.objects.get(id=dataset_id)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 0)

    response = solr.query(settings.SOLR_DATASETS_CORE, 'contributors', sort='slug asc')

    self.assertEqual(response['response']['numFound'], 0)
def test_reindex_data(self):
    response = self.client.get('/api/1.0/dataset/%s/import/%i/' % (self.dataset.slug, self.upload.id), **self.auth_headers)

    response = self.client.get('/api/1.0/dataset/%s/reindex/?typed_columns=True,False,False,False' % (self.dataset.slug), **self.auth_headers)

    self.assertEqual(response.status_code, 200)

    # Refetch dataset so that attributes will be updated
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual(self.dataset.row_count, 4)
    self.assertEqual([c['name'] for c in self.dataset.column_schema], self.upload.columns)
    self.assertEqual(self.dataset.initial_upload, self.upload)
    self.assertEqual(self.dataset.sample_data, self.upload.sample_data)

    task = self.dataset.current_task

    self.assertNotEqual(task, None)
    self.assertEqual(task.status, 'SUCCESS')
    self.assertEqual(task.task_name, 'panda.tasks.reindex')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual(task.traceback, None)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:3')['response']['numFound'], 1)
def test_import_additional_data_different_columns(self):
    self.dataset.import_data(self.user, self.upload)

    xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)
    xls_upload.columns = ['id', 'first_name', 'last_name', 'employer', 'MORE COLUMNS!']
    xls_upload.save()

    # Refresh from database
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertRaises(DataImportError, self.dataset.import_data, self.user, xls_upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    upload = DataUpload.objects.get(id=self.upload.id)
    xls_upload = DataUpload.objects.get(id=xls_upload.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(upload.imported, True)
    self.assertEqual(xls_upload.imported, False)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
def test_import_additional_data_different_columns(self):
    self.dataset.import_data(self.user, self.upload)

    utils.wait()

    xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)
    xls_upload.columns = ['id', 'first_name', 'last_name', 'employer', 'MORE COLUMNS!']
    xls_upload.save()

    # Refresh from database
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertRaises(DataImportError, self.dataset.import_data, self.user, xls_upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    upload = DataUpload.objects.get(id=self.upload.id)
    xls_upload = DataUpload.objects.get(id=xls_upload.id)

    self.assertEqual(dataset.columns, ['id', 'first_name', 'last_name', 'employer'])
    self.assertEqual(dataset.row_count, 4)
    self.assertEqual(upload.imported, True)
    self.assertEqual(xls_upload.imported, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
def test_change_user_reindex(self):
    solr.delete(settings.SOLR_DATASETS_CORE, '*:*')

    self.user.first_name = 'bazbarfoo'
    self.user.save()

    dataset = utils.get_test_dataset(self.user)
    upload = utils.get_test_data_upload(self.user, dataset)

    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, dataset.creator.first_name)['response']['numFound'], 1)

    old_name = dataset.creator.first_name
    dataset.creator.first_name = 'foobarbaz'
    dataset.creator.save()

    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, old_name)['response']['numFound'], 0)
    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, dataset.creator.first_name)['response']['numFound'], 1)
def search_dataset_data(self, request, **kwargs):
    """
    Perform a full-text search on only one dataset.

    See ``get_list``.
    """
    dataset = Dataset.objects.get(slug=kwargs['dataset_slug'])

    query = request.GET.get('q', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
    offset = int(request.GET.get('offset', 0))
    sort = request.GET.get('sort', '_docid_ asc')

    if query:
        solr_query = 'dataset_slug:%s AND (%s)' % (dataset.slug, query)
    else:
        solr_query = 'dataset_slug:%s' % dataset.slug

    if since:
        solr_query += ' AND last_modified:[' + since + 'Z TO *]'

    response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=offset, sort=sort, limit=limit)

    dataset_resource = DatasetResource()
    dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
    dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
    dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

    results = [SolrObject(d) for d in response['response']['docs']]

    page = PandaPaginator(request.GET, results, resource_uri=request.path_info, count=response['response']['numFound']).page()

    dataset_bundle.data.update(page)
    dataset_bundle.data['objects'] = []

    for obj in results:
        bundle = self.build_bundle(obj=obj, request=request)
        bundle = self.full_dehydrate(bundle)
        dataset_bundle.data['objects'].append(bundle.data)

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    SearchLog.objects.create(user=user, dataset=dataset, query=query)

    return dataset_bundle
def get_row(self, external_id):
    """
    Fetch a row from this dataset.
    """
    response = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s AND external_id:%s' % (self.slug, external_id), limit=1)

    if len(response['response']['docs']) < 1:
        return None

    return response['response']['docs'][0]
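# Usage sketch, as exercised by the tests elsewhere in this section: get_row()
# returns the raw Solr document (or None), so callers read Solr field names
# directly. The external_id value '5' is illustrative.
#
# row = dataset.get_row('5')
# if row is not None:
#     row['external_id']        # '5'
#     json.loads(row['data'])   # the original row values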
def get_list(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q" parameter.
    """
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
    offset = int(request.GET.get('offset', 0))

    category_slug = request.GET.get('category', None)
    creator_email = request.GET.get('creator_email', None)
    query = request.GET.get('q', '')
    simple = True if request.GET.get('simple', 'false').lower() == 'true' else False

    if category_slug == settings.PANDA_UNCATEGORIZED_SLUG:
        category_id = settings.PANDA_UNCATEGORIZED_ID
    elif category_slug:
        category_id = Category.objects.get(slug=category_slug).id
    else:
        category_id = None

    if category_id is not None and query:
        q = 'categories:%s %s' % (category_id, query)
    elif category_id is not None:
        q = 'categories:%s' % category_id
    else:
        q = query

    if creator_email:
        datasets = Dataset.objects.filter(creator__email=creator_email)
        count = datasets.count()
        datasets = datasets[offset:offset + limit]
    else:
        response = solr.query(settings.SOLR_DATASETS_CORE, q, offset=offset, limit=limit, sort='creation_date desc')
        count = response['response']['numFound']

        dataset_slugs = [d['slug'] for d in response['response']['docs']]
        datasets = Dataset.objects.filter(slug__in=dataset_slugs)

    paginator = PandaPaginator(request.GET, datasets, resource_uri=request.path_info, count=count)
    page = paginator.page()

    objects = []

    for obj in datasets:
        bundle = self.build_bundle(obj=obj, request=request)
        bundle = self.full_dehydrate(bundle)

        # Prune attributes we don't care about
        if simple:
            bundle = self.simplify_bundle(bundle)

        objects.append(bundle)

    page['objects'] = objects

    return self.create_response(request, page)
def search_dataset_data(self, request, **kwargs):
    """
    Perform a full-text search on only one dataset.

    See ``get_list``.
    """
    dataset = Dataset.objects.get(slug=kwargs['dataset_slug'])

    query = request.GET.get('q', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
    offset = int(request.GET.get('offset', 0))

    if query:
        solr_query = 'dataset_slug:%s AND (%s)' % (dataset.slug, query)
    else:
        solr_query = 'dataset_slug:%s' % dataset.slug

    if since:
        solr_query += ' AND last_modified:[' + since + 'Z TO *]'

    response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=offset, limit=limit)

    dataset_resource = DatasetResource()
    dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
    dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
    dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

    results = [SolrObject(d) for d in response['response']['docs']]

    page = PandaPaginator(request.GET, results, resource_uri=request.path_info, count=response['response']['numFound']).page()

    dataset_bundle.data.update(page)
    dataset_bundle.data['objects'] = []

    for obj in results:
        bundle = self.build_bundle(obj=obj, request=request)
        bundle = self.full_dehydrate(bundle)
        dataset_bundle.data['objects'].append(bundle.data)

    SearchLog.objects.create(user=request.user, dataset=dataset, query=query)

    return dataset_bundle
def test_reindex_with_currency(self):
    upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_MONEY)

    self.dataset.import_data(self.user, upload)

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    dataset.reindex_data(self.user, typed_columns=[False, True], column_types=['unicode', 'float'])

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual([c['name'] for c in dataset.column_schema], ['product', 'price'])
    self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'float'])
    self.assertEqual([c['indexed'] for c in dataset.column_schema], [False, True])
    self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, 'column_float_price'])
    self.assertEqual([c['min'] for c in dataset.column_schema], [None, 39.99])
    self.assertEqual([c['max'] for c in dataset.column_schema], [None, 2599.00])

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:39.99')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:[1500 TO *]')['response']['numFound'], 2)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:*')['response']['numFound'], 8)
def test_add_row_typed(self):
    self.dataset.import_data(self.user, self.upload, 0)
    self.dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

    # Refresh from database
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    new_row = ['5', 'Somebody', 'Else', 'Somewhere']

    self.dataset.add_row(self.user, new_row, external_id='5')
    row = self.dataset.get_row('5')

    self.assertEqual(row['external_id'], '5')

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:5')['response']['numFound'], 1)
def test_search_stale_dataset(self):
    self.dataset.import_data(self.user, self.upload, 0)
    self.dataset.update_full_text()

    # Import second dataset so we can make sure both match
    second_dataset = Dataset.objects.create(name='Second dataset', creator=self.dataset.creator)
    second_dataset.import_data(self.user, self.upload, 0)
    second_dataset.update_full_text()

    # Manually delete second dataset to simulate an integrity issue
    from django.db import connection, transaction

    cursor = connection.cursor()
    cursor.execute("DELETE FROM panda_dataset WHERE slug='%s'" % second_dataset.slug)
    transaction.commit_unless_managed()

    # Verify Solr data still exists
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.dataset.slug)['response']['numFound'], 4)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % second_dataset.slug)['response']['numFound'], 4)
    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.dataset.slug)['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % second_dataset.slug)['response']['numFound'], 1)

    # Execute search, which should invoke purge as a side-effect
    response = self.client.get('/api/1.0/data/?q=Christopher', **self.auth_headers)

    self.assertEqual(response.status_code, 200)

    # Verify Solr data has been purged
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.dataset.slug)['response']['numFound'], 4)
    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % second_dataset.slug)['response']['numFound'], 0)
    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.dataset.slug)['response']['numFound'], 1)
    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % second_dataset.slug)['response']['numFound'], 0)

    body = json.loads(response.content)

    # Verify that the group count is correct
    self.assertEqual(body['meta']['total_count'], 1)
    self.assertEqual(len(body['objects']), 1)

    # Verify that each matched dataset includes one result
    result_dataset = body['objects'][0]
    self.assertEqual(result_dataset['slug'], self.dataset.slug)
    self.assertEqual(result_dataset['meta']['total_count'], 1)
    self.assertEqual(len(result_dataset['objects']), 1)
def test_add_many_rows_typed(self):
    self.dataset.import_data(self.user, self.upload, 0)
    self.dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

    # Refresh dataset so row_count is available
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    new_rows = [
        (['5', 'Somebody', 'Else', 'Somewhere'], 5),
        (['6', 'Another', 'Person', 'Somewhere'], 6)
    ]

    self.dataset.add_many_rows(self.user, new_rows)
    row = self.dataset.get_row('6')

    self.assertEqual(row['external_id'], '6')

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:[5 TO 6]')['response']['numFound'], 2)
def test_get_datum(self):
    self.dataset.import_data(self.user, self.upload, 0)

    # Refetch dataset so that attributes will be updated
    self.dataset = Dataset.objects.get(id=self.dataset.id)

    # Get id of a datum in Solr
    datum = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s AND Brian' % self.dataset.slug)['response']['docs'][0]

    response = self.client.get('/api/1.0/dataset/%s/data/%s/' % (self.dataset.slug, datum['external_id']), **self.auth_headers)

    self.assertEqual(response.status_code, 200)

    body = json.loads(response.content)

    # Verify that correct attributes of the dataset are attached
    self.assertEqual(body['external_id'], datum['external_id'])
    self.assertEqual(body['dataset'], '/api/1.0/dataset/%s/' % self.dataset.slug)
def test_import_encoded_data(self):
    """
    This tests for a complicated case where a UnicodeDecodeError during
    import could be masked by an AttributeError in the return handler.
    """
    old_sniffer_size = settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE
    settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE = 50

    data_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_LATIN1_DATA_FILENAME)

    self.dataset.import_data(self.user, data_upload)

    task = self.dataset.current_task

    self.assertNotEqual(task, None)
    self.assertNotEqual(task.id, None)
    self.assertEqual(task.task_name, 'panda.tasks.import.csv')

    # Refresh from database
    dataset = Dataset.objects.get(id=self.dataset.id)
    data_upload = DataUpload.objects.get(id=data_upload.id)
    task = TaskStatus.objects.get(id=task.id)

    self.assertEqual(len(dataset.column_schema), 8)
    self.assertEqual(dataset.row_count, None)
    self.assertEqual(data_upload.imported, False)
    self.assertEqual(task.status, 'FAILURE')
    self.assertNotEqual(task.start, None)
    self.assertNotEqual(task.end, None)
    self.assertEqual('encoded' in task.traceback, True)
    self.assertEqual(dataset.locked, False)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'walking')['response']['numFound'], 0)

    settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE = old_sniffer_size
def search_dataset_data(self, request, **kwargs):
    """
    Perform a full-text search on only one dataset.

    See ``get_list``.
    """
    dataset = Dataset.objects.get(slug=kwargs['dataset_slug'])

    query = request.GET.get('q', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
    offset = int(request.GET.get('offset', 0))

    if query:
        # Group the user query so boolean operators inside it can't change
        # the meaning of the dataset_slug filter.
        solr_query = 'dataset_slug:%s AND (%s)' % (dataset.slug, query)
    else:
        solr_query = 'dataset_slug:%s' % dataset.slug

    response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=offset, limit=limit)

    dataset_resource = DatasetResource()
    dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
    dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
    dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

    results = [SolrObject(d) for d in response['response']['docs']]

    page = PandaPaginator(request.GET, results, resource_uri=request.path_info, count=response['response']['numFound']).page()

    dataset_bundle.data.update(page)
    dataset_bundle.data['objects'] = []

    for obj in results:
        bundle = self.build_bundle(obj=obj, request=request)
        bundle = self.full_dehydrate(bundle)
        dataset_bundle.data['objects'].append(bundle.data)

    return dataset_bundle
def test_metadata_searchable(self):
    response = solr.query(settings.SOLR_DATASETS_CORE, 'contributors', sort='slug asc')

    self.assertEqual(response['response']['numFound'], 1)
def run(self, dataset_slug, *args, **kwargs):
    """
    Execute reindex.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Reindexing failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    task_status = dataset.current_task
    task_status.begin(ugettext('Preparing to reindex'))

    if self.is_aborted():
        task_status.abort(ugettext('Aborted during preparation'))
        log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)
        return

    read_buffer = []
    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    i = 0

    while i < dataset.row_count:
        if not read_buffer:
            query = 'dataset_slug: %s' % (dataset.slug)
            response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
            read_buffer = response['response']['docs']

        data = read_buffer.pop(0)
        row = json.loads(data['data'])

        new_data = utils.solr.make_data_row(dataset, row)
        new_data['id'] = data['id']
        new_data['data_upload_id'] = data['data_upload_id']
        new_data = data_typer(new_data, row)

        add_buffer.append(new_data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update(ugettext('%.0f%% complete') % floor(float(i) / float(dataset.row_count) * 100))

            if self.is_aborted():
                task_status.abort(ugettext('Aborted after reindexing %.0f%%') % floor(float(i) / float(dataset.row_count) * 100))
                log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

        i += 1

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update(ugettext('100% complete'))

    # Refresh dataset
    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Reindexing could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    dataset.column_schema = data_typer.schema
    dataset.save()

    log.info('Finished reindex, dataset_slug: %s' % dataset_slug)

    return data_typer
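# A minimal sketch (hypothetical helper; the size defaults are assumptions
# standing in for SOLR_READ_BUFFER_SIZE / SOLR_ADD_BUFFER_SIZE) of the
# buffering scheme the task above uses: the offset advances one row at a
# time, but Solr is only queried when the read buffer runs dry, and documents
# are re-added in batches rather than one at a time.
def _example_buffered_copy(fetch_page, put_batch, row_count, read_size=500, add_size=500):
    """Copy row_count documents using paged reads and batched writes."""
    read_buffer = []
    add_buffer = []

    for i in range(row_count):
        if not read_buffer:
            # One Solr round-trip refills the buffer for the next read_size rows
            read_buffer = fetch_page(offset=i, limit=read_size)

        add_buffer.append(read_buffer.pop(0))

        if len(add_buffer) == add_size:
            put_batch(add_buffer)
            add_buffer = []

    # Flush whatever is left after the final partial batch
    if add_buffer:
        put_batch(add_buffer)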
def run(self, query, task_status_id, filename=None, *args, **kwargs):
    """
    Execute export.
    """
    from panda.models import Dataset, TaskStatus

    log = logging.getLogger(self.name)
    log.info('Beginning export, query: %s' % query)

    task_status = TaskStatus.objects.get(id=task_status_id)
    task_status.begin('Preparing to export')

    if not filename:
        filename = 'search_export_%s' % (now().isoformat())

    zip_name = '%s.zip' % filename

    path = os.path.join(settings.EXPORT_ROOT, filename)
    zip_path = os.path.join(settings.EXPORT_ROOT, zip_name)

    try:
        os.makedirs(os.path.realpath(path))
    except OSError:
        # The export directory already exists.
        pass

    zipfile = ZipFile(zip_path, 'w')

    # Group matching rows by dataset so each dataset gets its own CSV.
    response = solr.query_grouped(settings.SOLR_DATA_CORE, query, 'dataset_slug', offset=0, limit=1000, group_limit=0, group_offset=0)
    groups = response['grouped']['dataset_slug']['groups']

    datasets = {}

    for group in groups:
        dataset_slug = group['groupValue']
        count = group['doclist']['numFound']

        datasets[dataset_slug] = count

    total_n = 0
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for dataset_slug in datasets:
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Skipping part of export due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            continue

        filename = '%s.csv' % dataset_slug
        file_path = os.path.join(path, filename)

        f = open(file_path, 'w')
        writer = CSVKitWriter(f)

        # Header
        writer.writerow([c['name'] for c in dataset.column_schema])

        response = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug: %s %s' % (dataset_slug, query), offset=0, limit=0)

        # Update dataset and total counts for progress tracking
        datasets[dataset_slug] = response['response']['numFound']
        total_count = sum(datasets.values())

        n = 0

        while n < datasets[dataset_slug]:
            response = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug: %s %s' % (dataset_slug, query), offset=n, limit=SOLR_PAGE_SIZE)

            results = response['response']['docs']

            for row in results:
                data = json.loads(row['data'])
                writer.writerow(data)

            task_status.update('%.0f%% complete' % floor(float(total_n) / float(total_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after exporting %.0f%%' % floor(float(total_n) / float(total_count) * 100))
                log.warning('Export aborted, query: %s' % query)

                return

            n += SOLR_PAGE_SIZE
            total_n += len(results)

            time.sleep(throttle)

        f.close()

        # Add to zip and nuke temp file
        zipfile.write(file_path, filename)
        os.remove(file_path)

    # Finish zip file and nuke temp directory
    zipfile.close()
    os.rmdir(path)

    task_status.update('100% complete')

    log.info('Finished export, query: %s' % query)

    return zip_name
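# For reference, a sketch of the grouped response shape the export above
# walks. The structure mirrors Solr's field-grouping output as consumed by
# the loop over `groups`; the slugs and counts here are invented examples.
grouped_response = {
    'grouped': {
        'dataset_slug': {
            'groups': [
                {'groupValue': 'contributors', 'doclist': {'numFound': 4, 'docs': []}},
                {'groupValue': 'test-dataset', 'doclist': {'numFound': 8, 'docs': []}},
            ]
        }
    }
}

# Collapsing the groups into a slug -> count mapping, as the task does.
datasets = {
    group['groupValue']: group['doclist']['numFound']
    for group in grouped_response['grouped']['dataset_slug']['groups']
}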
def run(self, dataset_slug, *args, **kwargs):
    """
    Execute reindex.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info("Beginning reindex, dataset_slug: %s" % dataset_slug)

    dataset = Dataset.objects.get(slug=dataset_slug)

    task_status = dataset.current_task
    task_status.begin("Preparing to reindex")

    if self.is_aborted():
        task_status.abort("Aborted during preparation")
        log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)

        return

    read_buffer = []
    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value("PERF", "TASK_THROTTLE")

    i = 0

    while i < dataset.row_count:
        if not read_buffer:
            query = "dataset_slug: %s" % (dataset.slug)
            response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
            read_buffer = response["response"]["docs"]

        data = read_buffer.pop(0)
        row = json.loads(data["data"])

        new_data = utils.solr.make_data_row(dataset, row)
        new_data["id"] = data["id"]
        new_data["data_upload_id"] = data["data_upload_id"]
        new_data = data_typer(new_data, row)

        add_buffer.append(new_data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update("%.0f%% complete" % floor(float(i) / float(dataset.row_count) * 100))

            if self.is_aborted():
                task_status.abort("Aborted after reindexing %.0f%%" % floor(float(i) / float(dataset.row_count) * 100))
                log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)

                return

            time.sleep(throttle)

        i += 1

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update("100% complete")

    # Refresh dataset
    dataset = Dataset.objects.get(slug=dataset_slug)
    dataset.column_schema = data_typer.schema
    dataset.save()

    log.info("Finished reindex, dataset_slug: %s" % dataset_slug)

    return data_typer
def run(self, dataset_slug, query=None, filename=None, *args, **kwargs):
    """
    Execute export.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info('Beginning export, dataset_slug: %s %s' % (dataset_slug, query))

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Export failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

        return

    task_status = dataset.current_task
    task_status.begin('Preparing to export')

    if not filename:
        filename = '%s_%s.csv' % (dataset_slug, datetime.datetime.utcnow().isoformat())
    else:
        filename = '%s.csv' % filename

    path = os.path.join(settings.EXPORT_ROOT, filename)

    try:
        os.makedirs(os.path.realpath(os.path.dirname(path)))
    except OSError:
        # The export directory already exists.
        pass

    f = open(path, 'w')
    writer = CSVKitWriter(f)

    # Header
    writer.writerow([c['name'] for c in dataset.column_schema])

    solr_query = 'dataset_slug:%s' % dataset_slug

    if query:
        solr_query = '%s %s' % (solr_query, query)

    # Fetch the total result count up front so progress can be reported.
    response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=0, limit=0)

    total_count = response['response']['numFound']
    n = 0
    throttle = config_value('PERF', 'TASK_THROTTLE')

    while n < total_count:
        response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=n, limit=SOLR_PAGE_SIZE)

        results = response['response']['docs']

        for row in results:
            data = json.loads(row['data'])
            writer.writerow(data)

        task_status.update('%.0f%% complete' % floor(float(n) / float(total_count) * 100))

        if self.is_aborted():
            task_status.abort('Aborted after exporting %.0f%%' % floor(float(n) / float(total_count) * 100))
            log.warning('Export aborted, dataset_slug: %s' % dataset_slug)

            return

        n += SOLR_PAGE_SIZE

        time.sleep(throttle)

    f.close()

    task_status.update('100% complete')

    log.info('Finished export, dataset_slug: %s %s' % (dataset_slug, query))

    return filename
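# A compact sketch of the paged-export loop used above, assuming a
# hypothetical `query_page(offset, limit)` that returns (docs, num_found)
# and any csv.writer-compatible `writer`. Each Solr document stores its row
# as a JSON string in the `data` field, which is decoded before writing.
import json

def export_rows(query_page, writer, page_size=256):
    offset = 0
    total = None

    while total is None or offset < total:
        docs, total = query_page(offset=offset, limit=page_size)

        for doc in docs:
            writer.writerow(json.loads(doc['data']))

        offset += page_size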