예제 #1
0
def test_profiler_with_dv_file(object_id):
    """Profile the source file of one DataSetInfo and print the outcome.

    Manual smoke check: looks up the DataSetInfo, runs
    ``profiler_tasks.run_profile_by_filefield`` on it synchronously and
    prints either the profiler's error message or a success line.

    Args:
        object_id: ``object_id`` of the DataSetInfo to profile. A falsy
            value falls back to a hard-coded sample id.

    Returns:
        None. All results are reported via ``print``.
    """
    if not object_id:
        object_id = 'b7634453-4ae1-48aa-81fb-1dd9ee3f4b64'

    ds_info = DataSetInfo.objects.filter(object_id=object_id).first()

    # Check for a missing row *before* touching any attribute; otherwise a
    # lookup miss surfaces as an AttributeError instead of this message.
    if not ds_info:
        print(f'DataSetInfo not found for {object_id}')
        return
    print(f'(1) {ds_info.object_id}')

    profiler = profiler_tasks.run_profile_by_filefield(ds_info.object_id)

    # Stop on failure so the success line below is never printed for a
    # run that reported an error.
    if profiler.has_error():
        print(profiler.get_err_msg())
        return

    print('it worked!')
    def add_source_file(self,
                        dataset_info: DataSetInfo,
                        filename: str,
                        add_profile: bool = False) -> DataSetInfo:
        """Attach a test data file to ``dataset_info`` and optionally profile it.

        Args:
            dataset_info: DataSetInfo whose ``source_file`` field receives
                the file.
            filename: name of a file that must exist directly under
                ``TEST_DATA_DIR`` ("dpcreator/test_data").
            add_profile: when True, run the profiler on the saved file and
                assert that it reports no error.

        Returns:
            The DataSetInfo re-fetched from the database by ``object_id``.
        """
        # File to attach: must be in "dpcreator/test_data"
        #
        filepath = join(TEST_DATA_DIR, filename)
        self.assertTrue(isfile(filepath))

        # Attach the file to the `dataset_info.source_file` field.
        # The context manager closes the handle once the content has been
        # saved, instead of leaking it for the rest of the test run.
        #
        with open(filepath, 'rb') as file_handle:
            dataset_info.source_file.save(filename, File(file_handle))
        dataset_info.save()

        # If specified, profile the file
        #
        if add_profile is True:
            profile_handler = profiler_tasks.run_profile_by_filefield(
                dataset_info.object_id)
            print('profile_handler.has_error()', profile_handler.has_error())

            # Shouldn't have errors; show the message before the assertion
            # fails so the cause is visible in the test output.
            if profile_handler.has_error():
                print(f'!! error: {profile_handler.get_err_msg()}')

            self.assertTrue(profile_handler.has_error() is False)

        # Re-retrieve it so the caller sees the state persisted in the database
        return DataSetInfo.objects.get(object_id=dataset_info.object_id)
예제 #3
0
    def profile_file(self):
        """
        Profile the file attached to ``self.dataset_info`` and report the
        outcome over the websocket.

        - Does nothing if this object is already in an error state.
        - Delegates the profiling to ``run_profile_by_filefield``, passing
          ``settings.PROFILER_COLUMN_LIMIT`` as ``max_num_features``.
        - On failure: records the error message and sends it as a
          websocket profiler error.
        - On success: stores the profile in ``self.profile_variables`` and
          sends it, serialized as indented JSON, in a success message.
        """
        if self.has_error():
            return

        runner = run_profile_by_filefield(
            self.dataset_info.object_id,
            max_num_features=settings.PROFILER_COLUMN_LIMIT)

        if not runner.has_error():
            # Success path: keep the profile and push it to the client
            self.profile_variables = runner.data_profile
            serialized_profile = json.dumps(self.profile_variables,
                                            cls=DjangoJSONEncoder,
                                            indent=4)
            self.send_websocket_success_msg('Profile complete!',
                                            profile_str=serialized_profile)
            return

        # Failure path: remember the message locally and notify the client
        failure_text = runner.get_err_msg()
        self.add_err_msg(failure_text)
        self.send_websocket_profiler_err_msg(failure_text)
예제 #4
0
    def test_46_dataset_id_is_none(self):
        """(46) Test using bad DatasetInfo object id of None"""
        msgt(self.test_46_dataset_id_is_none.__doc__)

        # Hand the profiler ``None`` where a DatasetInfo object id belongs
        missing_id = None
        result = profiler_tasks.run_profile_by_filefield(
            missing_id, settings.PROFILER_COLUMN_LIMIT)

        # The run must fail with the "DatasetInfo not found" message
        self.assertTrue(result.has_error())
        self.assertIn(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND,
                      result.get_err_msg())
예제 #5
0
    def test_47_dataset_id_is_empty_string(self):
        """(47) Test using bad DatasetInfo object id of empty string"""
        msgt(self.test_47_dataset_id_is_empty_string.__doc__)

        # Run the profiler with a DatasetInfo object id of empty string
        profiler = profiler_tasks.run_profile_by_filefield(
            '', settings.PROFILER_COLUMN_LIMIT)

        # Should be an error: an empty string is rejected as an invalid
        # DatasetInfo object id
        self.assertTrue(profiler.has_error())
        print(profiler.get_err_msg())
        self.assertIn(dstatic.ERR_MSG_INVALID_DATASET_INFO_OBJECT_ID,
                      profiler.get_err_msg())
예제 #6
0
    def test_45_bad_dataset_id(self):
        """(45) Test using bad DatasetInfo object id"""
        msgt(self.test_45_bad_dataset_id.__doc__)

        # Borrow the object_id of a RegisteredDataverse: a real id, but
        # one that does not identify a DatasetInfo
        #
        registered_dataverse = RegisteredDataverse.objects.first()
        self.assertIsNotNone(registered_dataverse)
        wrong_object_id = registered_dataverse.object_id

        # Run the profile using the Django file field
        result = profiler_tasks.run_profile_by_filefield(
            wrong_object_id, settings.PROFILER_COLUMN_LIMIT)

        # The run must fail with the "DatasetInfo not found" message
        self.assertTrue(result.has_error())
        self.assertIn(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND,
                      result.get_err_msg())
예제 #7
0
    def test_30_filefield_empty(self):
        """(30) Test with empty file field"""
        msgt(self.test_30_filefield_empty.__doc__)

        steps = DepositorSetupInfo.DepositorSteps

        # Load the DataSetInfo; it should still be at the "uploaded" step
        #
        before = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
        self.assertEqual(before.depositor_setup_info.user_step,
                         steps.STEP_0100_UPLOADED)

        # Try to profile an empty Django FileField
        result = profiler_tasks.run_profile_by_filefield(before.object_id)

        # Expect an error reporting that the source file does not exist
        self.assertTrue(result.has_error())
        self.assertIn(pstatic.ERR_MSG_SOURCE_FILE_DOES_NOT_EXIST,
                      result.get_err_msg())

        # Reload from the database: the DepositorSetupInfo should now be
        # at the "profiling failed" step
        after = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
        self.assertEqual(after.depositor_setup_info.user_step,
                         steps.STEP_9300_PROFILING_FAILED)
예제 #8
0
def test_downloader(dataverse_file_id=3):
    """Download, then profile, one DataverseFileInfo, printing each step.

    Args:
        dataverse_file_id: primary key of the DataverseFileInfo to use.

    Returns:
        None. Stops early, after printing the error message, if either
        the download handler or the profiler reports an error.
    """
    divider = '-' * 40

    print('get object')
    dfi = DataverseFileInfo.objects.get(pk=dataverse_file_id)
    print('dfi: ', dfi)

    # ---------------------------
    # download check!
    # ---------------------------
    for banner_line in (divider, 'Download Info', divider):
        print(banner_line)

    downloader = DataverseDownloadHandler(dfi)
    if downloader.has_error():
        print('error: ', downloader.get_err_msg())
        return
    print('looks good!')

    # ---------------------------
    # profile it!
    # ---------------------------
    for banner_line in (divider, 'Profile it', divider):
        print(banner_line)

    outcome = profiler_tasks.run_profile_by_filefield(dfi.object_id)
    if outcome.has_error():
        print('error: ', outcome.get_err_msg())
        return

    # Show the raw profile first, then the same data as indented JSON
    print(outcome.data_profile)
    pretty_profile = json.dumps(outcome.data_profile,
                                cls=DjangoJSONEncoder,
                                indent=4)
    print(pretty_profile)
    print('profiled!')
예제 #9
0
    def test_40_filefield_correct(self):
        """(40) Test using filefield with legit file"""
        msgt(self.test_40_filefield_correct.__doc__)

        # Retrieve DataSetInfo; it should start at the "uploaded" step
        #
        dsi = DataSetInfo.objects.get(object_id=self.ds_01_object_id)
        self.assertEqual(
            dsi.depositor_setup_info.user_step,
            DepositorSetupInfo.DepositorSteps.STEP_0100_UPLOADED)

        # --------------------------------------------------
        # Attach the file to the DataSetInfo's file field
        # --------------------------------------------------
        filename = 'fearonLaitin.csv'
        filepath = join(TEST_DATA_DIR, filename)
        self.assertTrue(isfile(filepath))

        # The context manager closes the handle once the content has been
        # saved, instead of leaking it for the rest of the test run.
        with open(filepath, 'rb') as source_fh:
            dsi.source_file.save(filename, File(source_fh))
        dsi.save()

        # Run the profile using the Django file field
        profiler = profiler_tasks.run_profile_by_filefield(
            dsi.object_id, settings.PROFILER_COLUMN_LIMIT)

        # Should be no error and exactly PROFILER_COLUMN_LIMIT variables
        self.assertTrue(profiler.has_error() is False)
        self.assertEqual(profiler.num_variables,
                         settings.PROFILER_COLUMN_LIMIT)

        # Re-retrieve DataSetInfo to check what was persisted
        #
        dsi2 = DataSetInfo.objects.get(object_id=self.ds_01_object_id)

        info = dsi2.data_profile_as_dict()

        self.assertIn('variables', info)
        self.assertEqual(len(info['variables']),
                         settings.PROFILER_COLUMN_LIMIT)

        self.assertEqual(
            dsi2.depositor_setup_info.user_step,
            DepositorSetupInfo.DepositorSteps.STEP_0400_PROFILING_COMPLETE)

        dataset_section = dsi2.profile_variables['dataset']
        self.assertEqual(dataset_section['variableCount'],
                         len(dataset_section['variableOrder']))

        # Make sure the "dataset.variableOrder" column names are in the
        # "variables" dict. Each entry is unpacked as a two-item pair whose
        # second item is the column name.
        # NOTE(review): presumably (index, name) pairs; confirm against the
        # profiler's output format.
        #
        for _idx, colname in dataset_section['variableOrder']:
            self.assertIn(colname, dsi2.profile_variables['variables'])
예제 #10
0
    def test_10_download_profile_success(self):
        """(10) Test successful download + profile"""
        msgt(self.test_10_download_profile_success.__doc__)

        # Start from a DataverseFileInfo that has no source file yet
        dfi = DataverseFileInfo.objects.get(pk=3)
        self.assertFalse(dfi.source_file)

        crisis_filepath = join(TEST_DATA_DIR, 'crisis.tab')
        print('crisis_filepath', crisis_filepath)
        self.assertTrue(isfile(crisis_filepath))

        # Register a canned HTTP response so the download handler receives
        # the local test file instead of contacting Dataverse
        with open(crisis_filepath, "rb") as data_file:
            file_bytes = data_file.read()
            responses.add(
                responses.GET,
                "https://dataverse.harvard.edu/api/access/datafile/101649",
                body=file_bytes,
                status=200,
                content_type="text/tab-separated-values",
                stream=True,
            )

        # ---------------------------
        # Run the Downloader!
        # ---------------------------
        downloader = DataverseDownloadHandler(dfi)
        print('dhandler.has_error()', downloader.has_error())
        self.assertTrue(downloader.has_error() is False)

        print('dfi.source_file', dfi.source_file)
        self.assertTrue(dfi.source_file)

        print('>>> new_file_name', downloader.new_file_name)
        self.assertEqual(downloader.new_file_name, 'crisis.tab')

        # ---------------------------
        # Run the Profile Handler!
        # ---------------------------
        profile_handler = profiler_tasks.run_profile_by_filefield(
            dfi.object_id)
        print('profile_handler.has_error()', profile_handler.has_error())
        if profile_handler.has_error():
            print(profile_handler.get_err_msg())
        self.assertTrue(profile_handler.has_error() is False)

        # A data profile must have been produced
        self.assertTrue(profile_handler.data_profile)

        # The reported variable count must match the number of profiled
        # variables
        profile = profile_handler.data_profile
        self.assertEqual(len(profile['variables']),
                         profile['dataset']['variableCount'])

        # Spot-check that several feature names appear in the serialized
        # profile
        serialized = json.dumps(profile_handler.data_profile,
                                cls=DjangoJSONEncoder,
                                indent=4)
        for feature_name in ('WARCRI', 'WARCASE', 'SCMEDIAN'):
            self.assertIn(feature_name, serialized)

        # The persisted DepositorSetupInfo should now be at the
        # "profiling complete" step
        #
        reloaded = DataverseFileInfo.objects.get(pk=3)
        self.assertEqual(
            reloaded.depositor_setup_info.user_step,
            DepositorSetupInfo.DepositorSteps.STEP_0400_PROFILING_COMPLETE)