def test_profiler_with_dv_file(object_id):
    """Scratch check: run the profiler against an existing DataSetInfo object"""
    if not object_id:
        object_id = 'b7634453-4ae1-48aa-81fb-1dd9ee3f4b64'

    ds_info = DataSetInfo.objects.filter(object_id=object_id).first()
    if not ds_info:
        print(f'DataSetInfo not found for {object_id}')
        return
    print(f'(1) {ds_info.object_id}')

    profiler = profiler_tasks.run_profile_by_filefield(ds_info.object_id)
    # profiler = profiler_tasks.run_profile_by_filefield.delay(ds_info.object_id)
    # print('Pause 3 seconds...')
    # time.sleep(3)

    if profiler.has_error():
        print(profiler.get_err_msg())
        return
    print('it worked!')

    # Re-retrieve the saved profile; let it blow up if it's missing....
    ds_info = DataSetInfo.objects.get(object_id=object_id)
    print(f'(2) {ds_info.object_id}')

    info = ds_info.data_profile_as_dict()
    print(info)
def add_source_file(self, dataset_info: DataSetInfo, filename: str,
                    add_profile: bool = False) -> DataSetInfo:
    """Attach a source file to the given DataSetInfo and, optionally, profile it.
    - filename: name of a file under dpcreator/test_data
    """
    # File to attach: Must be in "dpcreator/test_data"
    #
    filepath = join(TEST_DATA_DIR, filename)
    self.assertTrue(isfile(filepath))

    # Attach the file to the `dataset_info.source_file` field
    #
    django_file = File(open(filepath, 'rb'))
    dataset_info.source_file.save(filename, django_file)
    dataset_info.save()

    # If specified, profile the file
    #
    if add_profile is True:
        profile_handler = profiler_tasks.run_profile_by_filefield(dataset_info.object_id)
        print('profile_handler.has_error()', profile_handler.has_error())

        # Shouldn't have errors
        if profile_handler.has_error():
            print(f'!! error: {profile_handler.get_err_msg()}')
        self.assertTrue(profile_handler.has_error() is False)

    # re-retrieve it...
    return DataSetInfo.objects.get(object_id=dataset_info.object_id)
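# A hedged usage sketch for the helper above, inside a Django TestCase in this
# module. `self.ds_01_object_id` and 'fearonLaitin.csv' mirror the fixtures used
# elsewhere in these tests; treat them as placeholders for whatever the test loads.
def test_helper_usage_sketch(self):
    dsi = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
    dsi = self.add_source_file(dsi, 'fearonLaitin.csv', add_profile=True)

    self.assertTrue(dsi.source_file)             # file is attached
    self.assertTrue(dsi.data_profile_as_dict())  # profile was stored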
def profile_file(self):
    """
    1. Get the DataSetInfo object by id
    2. Read the associated filefield
    3. Parse it into a dataframe
    4. Run the variable profiler on the dataframe
    5. Send results back via websocket
    """
    if self.has_error():
        return

    prunner = run_profile_by_filefield(self.dataset_info.object_id,
                                       max_num_features=settings.PROFILER_COLUMN_LIMIT)
    if prunner.has_error():
        user_msg = prunner.get_err_msg()
        self.add_err_msg(user_msg)
        self.send_websocket_profiler_err_msg(user_msg)
        return

    # self.data_profile = ph.get_data_profile()
    self.profile_variables = prunner.data_profile

    profile_str = json.dumps(self.profile_variables,
                             cls=DjangoJSONEncoder,
                             indent=4)

    self.send_websocket_success_msg('Profile complete!', profile_str=profile_str)
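# A minimal sketch of calling the profiler task directly, outside the websocket
# wrapper above. It assumes the same imports as this module (run_profile_by_filefield,
# settings); the function name and the idea of returning the dict are illustrative only.
def profile_one_dataset(object_id):
    """Profile a single DataSetInfo file field; return the profile dict or None on error."""
    prunner = run_profile_by_filefield(object_id,
                                       max_num_features=settings.PROFILER_COLUMN_LIMIT)
    if prunner.has_error():
        print('Profiler error:', prunner.get_err_msg())
        return None
    return prunner.data_profile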
def test_46_dataset_id_is_none(self):
    """(46) Test using bad DatasetInfo object id of None"""
    msgt(self.test_46_dataset_id_is_none.__doc__)

    # Run the profiler with a DatasetInfo object id of None
    profiler = profiler_tasks.run_profile_by_filefield(None,
                                                       settings.PROFILER_COLUMN_LIMIT)

    self.assertTrue(profiler.has_error())
    self.assertTrue(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND in profiler.get_err_msg())
def test_47_dataset_id_is_empty_string(self):
    """(47) Test using bad DatasetInfo object id of an empty string"""
    msgt(self.test_47_dataset_id_is_empty_string.__doc__)

    # Run the profiler with a DatasetInfo object id of empty string
    profiler = profiler_tasks.run_profile_by_filefield('',
                                                       settings.PROFILER_COLUMN_LIMIT)

    # Should be an error: an empty string is not a valid object id
    self.assertTrue(profiler.has_error())
    print(profiler.get_err_msg())
    self.assertTrue(dstatic.ERR_MSG_INVALID_DATASET_INFO_OBJECT_ID in profiler.get_err_msg())
def test_45_bad_dataset_id(self):
    """(45) Test using bad DatasetInfo object id"""
    msgt(self.test_45_bad_dataset_id.__doc__)

    # Get a real but incorrect object_id
    #
    reg_dv = RegisteredDataverse.objects.first()
    self.assertIsNotNone(reg_dv)
    bad_dataset_object_id = reg_dv.object_id

    # Run the profile using the Django file field
    profiler = profiler_tasks.run_profile_by_filefield(bad_dataset_object_id,
                                                       settings.PROFILER_COLUMN_LIMIT)

    # Should be an error: no DataSetInfo exists with this object_id
    self.assertTrue(profiler.has_error())
    self.assertTrue(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND in profiler.get_err_msg())
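# The three bad-id cases above (tests 45-47) could also be written as one
# table-driven test using unittest's `subTest`. A hedged sketch, reusing the
# same fixtures and error-message constants as this module:
def test_bad_dataset_ids_sketch(self):
    reg_dv = RegisteredDataverse.objects.first()
    self.assertIsNotNone(reg_dv)

    cases = [
        (None, dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND),              # id of None
        ('', dstatic.ERR_MSG_INVALID_DATASET_INFO_OBJECT_ID),        # empty string
        (reg_dv.object_id, dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND),  # real id, wrong model
    ]
    for bad_id, expected_msg in cases:
        with self.subTest(bad_id=bad_id):
            profiler = profiler_tasks.run_profile_by_filefield(bad_id,
                                                               settings.PROFILER_COLUMN_LIMIT)
            self.assertTrue(profiler.has_error())
            self.assertTrue(expected_msg in profiler.get_err_msg())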
def test_30_filefield_empty(self):
    """(30) Test with empty file field"""
    msgt(self.test_30_filefield_empty.__doc__)

    # Retrieve DataSetInfo
    #
    dsi = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
    self.assertEqual(dsi.depositor_setup_info.user_step,
                     DepositorSetupInfo.DepositorSteps.STEP_0100_UPLOADED)

    # Try to profile an empty Django FileField
    profiler = profiler_tasks.run_profile_by_filefield(dsi.object_id)

    # Error!
    self.assertTrue(profiler.has_error())
    self.assertTrue(pstatic.ERR_MSG_SOURCE_FILE_DOES_NOT_EXIST in profiler.get_err_msg())

    # Retrieve the saved DataSetInfo; the DepositorSetupInfo should have a new status
    dsi2 = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
    self.assertEqual(dsi2.depositor_setup_info.user_step,
                     DepositorSetupInfo.DepositorSteps.STEP_9300_PROFILING_FAILED)
def test_downloader(dataverse_file_id=3):
    """Scratch check: download a Dataverse file, then profile it"""
    print('get object')
    dfi = DataverseFileInfo.objects.get(pk=dataverse_file_id)
    print('dfi: ', dfi)

    # ---------------------------
    # download check!
    # ---------------------------
    print('-' * 40)
    print('Download Info')
    print('-' * 40)
    dhandler = DataverseDownloadHandler(dfi)
    if dhandler.has_error():
        print('error: ', dhandler.get_err_msg())
        return
    else:
        print('looks good!')

    # ---------------------------
    # profile it!
    # ---------------------------
    print('-' * 40)
    print('Profile it')
    print('-' * 40)
    profile_handler = profiler_tasks.run_profile_by_filefield(dfi.object_id)
    if profile_handler.has_error():
        print('error: ', profile_handler.get_err_msg())
        return
    else:
        print(profile_handler.data_profile)
        print(json.dumps(profile_handler.data_profile,
                         cls=DjangoJSONEncoder,
                         indent=4))
        print('profiled!')
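# Hedged usage note: scratch functions like test_downloader() and
# test_profiler_with_dv_file() are meant to be run by hand, e.g. from
# `python manage.py shell`. The module path below is hypothetical:
#
#   >>> from opendp_apps.profiler.scratch_checks import test_downloader
#   >>> test_downloader(dataverse_file_id=3)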
def test_40_filefield_correct(self):
    """(40) Test using filefield with legit file"""
    msgt(self.test_40_filefield_correct.__doc__)

    # Retrieve DataSetInfo
    #
    dsi = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
    self.assertEqual(dsi.depositor_setup_info.user_step,
                     DepositorSetupInfo.DepositorSteps.STEP_0100_UPLOADED)

    # --------------------------------------------------
    # Attach the file to the DataSetInfo's file field
    # --------------------------------------------------
    filename = 'fearonLaitin.csv'
    filepath = join(TEST_DATA_DIR, filename)
    self.assertTrue(isfile(filepath))

    django_file = File(open(filepath, 'rb'))
    dsi.source_file.save(filename, django_file)
    dsi.save()

    # Run the profile using the Django file field
    profiler = profiler_tasks.run_profile_by_filefield(dsi.object_id,
                                                       settings.PROFILER_COLUMN_LIMIT)

    # Should be no error and the correct number of features
    # print('profiler.get_err_msg()', profiler.get_err_msg())
    self.assertTrue(profiler.has_error() is False)
    self.assertEqual(profiler.num_variables, settings.PROFILER_COLUMN_LIMIT)

    # Re-retrieve DataSetInfo
    #
    dsi2 = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
    info = dsi2.data_profile_as_dict()

    # The profiler reads only the first PROFILER_COLUMN_LIMIT features
    self.assertTrue('variables' in info)
    self.assertEqual(len(info['variables'].keys()), settings.PROFILER_COLUMN_LIMIT)

    self.assertEqual(dsi2.depositor_setup_info.user_step,
                     DepositorSetupInfo.DepositorSteps.STEP_0400_PROFILING_COMPLETE)

    # print('dsi2.profile_variables', dsi2.profile_variables)
    # self.assertEqual(len(dsi2.profile_variables['variables'].keys()),
    #                  settings.PROFILER_COLUMN_LIMIT)
    # self.assertEqual(dsi2.profile_variables['dataset']['variableCount'],
    #                  settings.PROFILER_COLUMN_LIMIT)

    self.assertEqual(dsi2.profile_variables['dataset']['variableCount'],
                     len(dsi2.profile_variables['dataset']['variableOrder']))

    # Make sure the "dataset.variableOrder" column names are in the "variables" dict
    #
    for idx, colname in dsi2.profile_variables['dataset']['variableOrder']:
        self.assertTrue(colname in dsi2.profile_variables['variables'])
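# A hedged sketch of the profile structure the assertions above rely on.
# Column names and counts are placeholders, not values from fearonLaitin.csv;
# only the keys ('dataset', 'variableCount', 'variableOrder', 'variables') come
# from the tests in this module.
EXAMPLE_PROFILE_SHAPE = {
    'dataset': {
        'variableCount': 2,                               # must equal len(variableOrder)
        'variableOrder': [[0, 'col_a'], [1, 'col_b']],    # [index, column name] pairs
    },
    'variables': {
        'col_a': {},   # per-variable profile info, keyed by column name
        'col_b': {},
    },
}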
def test_10_download_profile_success(self):
    """(10) Test successful download + profile"""
    msgt(self.test_10_download_profile_success.__doc__)

    dfi = DataverseFileInfo.objects.get(pk=3)
    self.assertTrue(not dfi.source_file)

    crisis_filepath = join(TEST_DATA_DIR, 'crisis.tab')
    print('crisis_filepath', crisis_filepath)
    self.assertTrue(isfile(crisis_filepath))

    with open(crisis_filepath, "rb") as data_file:
        responses.add(
            responses.GET,
            "https://dataverse.harvard.edu/api/access/datafile/101649",
            body=data_file.read(),
            status=200,
            content_type="text/tab-separated-values",
            stream=True,
        )

    # ---------------------------
    # Run the Downloader!
    # ---------------------------
    dhandler = DataverseDownloadHandler(dfi)
    print('dhandler.has_error()', dhandler.has_error())
    self.assertTrue(dhandler.has_error() is False)

    print('dfi.source_file', dfi.source_file)
    self.assertTrue(dfi.source_file)

    print('>>> new_file_name', dhandler.new_file_name)
    self.assertEqual(dhandler.new_file_name, 'crisis.tab')

    # ---------------------------
    # Run the Profile Handler!
    # ---------------------------
    profile_handler = profiler_tasks.run_profile_by_filefield(dfi.object_id)
    print('profile_handler.has_error()', profile_handler.has_error())
    if profile_handler.has_error():
        print(profile_handler.get_err_msg())
    self.assertTrue(profile_handler.has_error() is False)

    # The data profile exists
    self.assertTrue(profile_handler.data_profile)

    # Check that the count of variables matches the actual number of variables
    profile = profile_handler.data_profile
    self.assertTrue(len(profile['variables']) == profile['dataset']['variableCount'])

    # Check for several features within the profile
    json_profile = json.dumps(profile_handler.data_profile,
                              cls=DjangoJSONEncoder,
                              indent=4)
    for fn in ['WARCRI', 'WARCASE', 'SCMEDIAN']:
        self.assertTrue(json_profile.find(fn) > -1)

    # Check the status on the DepositorSetupInfo
    #
    dfi2 = DataverseFileInfo.objects.get(pk=3)
    self.assertEqual(dfi2.depositor_setup_info.user_step,
                     DepositorSetupInfo.DepositorSteps.STEP_0400_PROFILING_COMPLETE)
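# A minimal, self-contained sketch of the `responses` mocking pattern used in
# test_10 above. The test above presumably runs under `@responses.activate` (or a
# class-level equivalent); the URL and body here are placeholders.
import requests
import responses


@responses.activate
def sketch_mocked_download():
    responses.add(
        responses.GET,
        'https://dataverse.harvard.edu/api/access/datafile/101649',
        body=b'col_a\tcol_b\n1\t2\n',
        status=200,
        content_type='text/tab-separated-values',
    )
    # Any code that GETs the URL via `requests` now receives the canned bytes above
    resp = requests.get('https://dataverse.harvard.edu/api/access/datafile/101649')
    assert resp.status_code == 200
    assert resp.content.startswith(b'col_a')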