Example #1
    def test_structured_data_labeler_fit_predict_take_data_obj(self):
        data = pd.DataFrame(
            [
                "123 Fake st",
                "1/1/2021",
                "blah",
                "333-44-2341",
                "*****@*****.**",
                "John Doe",
                "123-4567",
            ]
        )
        labels = pd.DataFrame(
            [
                "ADDRESS",
                "DATETIME",
                "UNKNOWN",
                "SSN",
                "EMAIL_ADDRESS",
                "PERSON",
                "PHONE_NUMBER",
            ]
        )
        for dt in ["csv", "json", "parquet"]:
            data_obj = dp.Data(data=data, data_type=dt)
            label_obj = dp.Data(data=labels, data_type=dt)
            labeler = dp.DataLabeler(labeler_type="structured", trainable=True)
            self.assertIsNotNone(labeler.fit(x=data_obj, y=label_obj))
            self.assertIsNotNone(labeler.predict(data=data_obj))
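For reference, a standalone sketch of the pattern the loop above exercises: `dp.Data` dispatches on `data_type` to a reader class, with no file I/O for in-memory frames. The printed class names are an assumption about current DataProfiler releases:

import pandas as pd
import dataprofiler as dp

df = pd.DataFrame(["123 Fake st", "1/1/2021"])
for dt in ["csv", "json", "parquet"]:
    data_obj = dp.Data(data=df, data_type=dt)
    # dp.Data is a factory; data_type selects the reader wrapper.
    print(dt, type(data_obj).__name__)  # e.g. CSVData, JSONData, ParquetData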
Example #2
    def test_warning_tf_multiple_dp_with_update(self):
        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })
        print('running dp1')
        profile1 = dp.Profiler(data, profiler_options=profile_options)

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })
        print('running dp2')
        profile2 = dp.Profiler(data, profiler_options=profile_options)

        profile1.update_profile(data)
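The dotted keys passed to `ProfilerOptions.set` map onto nested option attributes; a sketch of the assumed attribute-style equivalent (attribute names inferred from the keys above, not verified against every release):

import dataprofiler as dp

profile_options = dp.ProfilerOptions()
# Assumed equivalent of profile_options.set({"text.is_enabled": False, ...})
for col_type in ("text", "int", "float", "order", "category", "datetime"):
    getattr(profile_options.structured_options, col_type).is_enabled = False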
Example #3
    def test_unstructured_data_labeler_fit_predict_take_data_obj(self):
        # Determine string index in joined data at cell i
        def data_ind(i, data):
            # Take off 1 in base case so we don't include trailing comma
            if i == -1:
                return -1
            # Add 1 with every pass to account for commas
            return len(data[i]) + 1 + data_ind(i - 1, data)

        # Generate entities list for a set of structured data and labels
        def entities(data, labels):
            return [(0, len(data[0]), labels[0])] + \
                   [(data_ind(i - 1, data) + 1, data_ind(i, data), labels[i])
                    for i in range(1, len(data))]
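        # Worked example (values computed by hand, not from the library):
        #   entities(["123 Fake st", "1/1/2021"], ["ADDRESS", "DATETIME"])
        #   -> [(0, 11, "ADDRESS"), (12, 20, "DATETIME")]
        # since "123 Fake st,1/1/2021" places the date at characters 12-19
        # and each end index is exclusive.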

        data_cells = [
            "123 Fake st", "1/1/2021", "blah", "555-55-5555",
            "*****@*****.**", "John Doe", "123-4567"
        ]
        label_cells = [
            "ADDRESS", "DATETIME", "UNKNOWN", "SSN", "EMAIL_ADDRESS", "PERSON",
            "PHONE_NUMBER"
        ]

        # Test with one large string of data
        data_str = ",".join(data_cells)
        label_str = entities(data_cells, label_cells)
        for dt in ["csv", "json", "parquet"]:
            data_obj = dp.Data(data=pd.DataFrame([data_str]), data_type=dt)
            labeler = dp.DataLabeler(labeler_type="unstructured",
                                     trainable=True)
            self.assertIsNotNone(labeler.fit(x=data_obj, y=[label_str]))
            self.assertIsNotNone(labeler.predict(data=data_obj))

        # Test with the string broken up into different df entries
        data_1 = data_cells[:3]
        data_2 = data_cells[3:5]
        data_3 = data_cells[5:]
        data_df = pd.DataFrame(
            [",".join(data_1), ",".join(data_2), ",".join(data_3)])
        zipped = [(data_1, label_cells[:3]), (data_2, label_cells[3:5]),
                  (data_3, label_cells[5:])]
        three_labels = [entities(d, l) for (d, l) in zipped]
        for dt in ["csv", "json", "parquet"]:
            data_obj = dp.Data(data=data_df, data_type=dt)
            labeler = dp.DataLabeler(labeler_type="unstructured",
                                     trainable=True)
            self.assertIsNotNone(labeler.fit(x=data_obj, y=three_labels))
            self.assertIsNotNone(labeler.predict(data=data_obj))

        # Test with text data object
        text_obj = dp.Data(data=data_str, data_type="text")
        labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
        self.assertIsNotNone(labeler.fit(x=text_obj, y=[label_str]))
        self.assertIsNotNone(labeler.predict(data=text_obj))
Example #4
    def test_warning_tf_run_dp_multiple_times(self):
        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')

        for i in range(3):
            print('running dp =============================', i)
            data = dp.Data(path)
            profile_options = dp.ProfilerOptions()
            profile_options.set({
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "datetime.is_enabled": False,
            })

            profile = dp.Profiler(data, profiler_options=profile_options)

            results = profile.report()

            columns = []
            predictions = []
            for col in results['data_stats']:
                columns.append(col)
                predictions.append(results['data_stats'][col]['data_label'])
Example #5
    def test_warning_tf(self):

        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')
        data = dp.Data(path)

        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "datetime.is_enabled": False
        })

        profile = dp.StructuredProfiler(data, options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for i in range(len(results['data_stats'])):
            columns.append(i)
            predictions.append(results['data_stats'][i]['data_label'])
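Note that this variant indexes `data_stats` as a list, unlike the earlier snippet that iterates it as a dict keyed by column name. A sketch of recovering column names alongside predictions under the list layout; the `column_name` key is an assumption about the report schema:

# Given the `results` dict produced above:
labels_by_column = {
    stats.get("column_name", i): stats["data_label"]
    for i, stats in enumerate(results["data_stats"])
}
print(labels_by_column)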
Example #6
    def test_text_data_raises_error(self):
        text_file_path = os.path.join(test_root_path, 'data',
                                      'txt/sentence-10x.txt')
        with self.assertRaisesRegex(
                TypeError, 'Cannot provide TextData object'
                ' to Profiler'):
            profile = dp.Profiler(dp.Data(text_file_path))
Example #7
    def test_save_and_load(self):
        datapth = "dataprofiler/tests/data/"
        test_files = ["csv/guns.csv", "csv/iris.csv"]

        def _clean_report(report):
            data_stats = report["data_stats"]
            for key in data_stats:
                stats = data_stats[key]["statistics"]
                if "histogram" in stats:
                    if "bin_counts" in stats["histogram"]:
                        stats["histogram"]["bin_counts"] = \
                            stats["histogram"]["bin_counts"].tolist()
                    if "bin_edges" in stats["histogram"]:
                        stats["histogram"]["bin_edges"] = \
                            stats["histogram"]["bin_edges"].tolist()
            return report

        for test_file in test_files:
            # Create Data and Profiler objects
            data = dp.Data(os.path.join(datapth, test_file))
            save_profile = dp.Profiler(data)

            # Save and Load profile with Mock IO
            with mock.patch('builtins.open') as m:
                mock_file = setup_save_mock_open(m)
                save_profile.save()
                mock_file.seek(0)
                load_profile = dp.Profiler.load("mock.pkl")

            # Check that reports are equivalent
            save_report = _clean_report(save_profile.report())
            load_report = _clean_report(load_profile.report())
            self.assertDictEqual(save_report, load_report)
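Outside the mocked IO, the same round trip can target a real file. A minimal sketch, assuming `save` accepts an optional `filepath` keyword (the test only shows the zero-argument form):

import dataprofiler as dp

data = dp.Data("dataprofiler/tests/data/csv/iris.csv")
profile = dp.Profiler(data)
profile.save(filepath="profile.pkl")   # 'filepath' kwarg is an assumption
loaded = dp.Profiler.load("profile.pkl")
assert loaded.report() is not None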
Example #8
    def test_warning_tf_run_dp_multiple_times(self):
        test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, "data")
        path = os.path.join(test_dir, "csv/diamonds.csv")

        for i in range(3):
            print("running dp =============================", i)
            data = dp.Data(path)
            profile_options = dp.ProfilerOptions()
            profile_options.structured_options.set(
                {
                    "text.is_enabled": False,
                    "int.is_enabled": False,
                    "float.is_enabled": False,
                    "order.is_enabled": False,
                    "category.is_enabled": False,
                    "chi2_homogeneity.is_enabled": False,
                    "datetime.is_enabled": False,
                }
            )

            profile = dp.StructuredProfiler(data, options=profile_options)

            results = profile.report()

            columns = []
            predictions = []
            for j in range(len(results["data_stats"])):
                columns.append(j)
                predictions.append(results["data_stats"][j]["data_label"])
Example #9
    def test_accepted_inputs(self):
        with self.assertRaisesRegex(
                TypeError, "Input data must be either a "
                "`pd.DataFrame` or a `data_profiler.Data` "
                "and not of type `TextData`."):
            dp.train_structured_labeler(None, None)

        with self.assertRaisesRegex(TypeError,
                                    "The output dirpath must be a string."):
            dp.train_structured_labeler(pd.DataFrame([]), save_dirpath=0)

        # doesn't accept text data
        text_data = dp.Data(data='test', data_type='text')
        with self.assertRaisesRegex(
                TypeError, "Input data must be either a "
                "`pd.DataFrame` or a `data_profiler.Data` "
                "and not of type `TextData`."):
            dp.train_structured_labeler(text_data, None)

        with self.assertRaisesRegex(
                ValueError, "The `save_dirpath` is not valid or not "
                "accessible."):
            dp.train_structured_labeler(pd.DataFrame([]), "/a/test")

        try:
            data = {
                'BACKGROUND': ["Beep", "Boop"],
                'PERSON': ["GRANT", "MENSHENG"]
            }
            df = pd.DataFrame(data=data)
            dp.train_structured_labeler(df, save_dirpath=None)

            fake_data = dp.Data(data=df, data_type='csv')
            dp.train_structured_labeler(fake_data, save_dirpath=None)

            fake_data = dp.Data(data=df, data_type='json')
            dp.train_structured_labeler(fake_data, save_dirpath=None)

            fake_data = dp.Data(data=df, data_type='parquet')
            dp.train_structured_labeler(fake_data, save_dirpath=None)

        except Exception as e:
            self.fail(str(e))
Example #10
    def test_multi_labelers(self, *mocks):
        """
        Test Multiple labelers called consecutively.
        :return:
        """
        data = dp.Data(data=pd.DataFrame([12, 2, 3, 4, 5]).astype(str),
                       data_type='parquet')
        data2 = dp.Data(data=pd.DataFrame(['atest', 'b', 'c']),
                        data_type='csv')

        structured_labeler_1 = dp.DataLabeler(labeler_type='structured')
        structured_labeler_1.predict(data)
        unstructured_labeler = dp.DataLabeler(labeler_type='unstructured')
        unstructured_labeler._label_encoding = {
            'PAD': 0,
            'CITY': 1,  # SAME AS BACKGROUND
            'BACKGROUND': 1,
            'ADDRESS': 2,
            'BAN': 3,
            'CREDIT_CARD': 4,
            'EMAIL_ADDRESS': 5,
            'UUID': 6,
            'HASH_OR_KEY': 7,
            'IPV4': 8,
            'IPV6': 9,
            'MAC_ADDRESS': 10,
            'NAME': 11,  # SAME AS PERSON
            'PERSON': 11,
            'PHONE_NUMBER': 12,
            'SSN': 13,
            'URL': 14,
            'DATETIME': 15,
            'INTEGER_BIG': 16,  # SAME AS INTEGER
            'INTEGER': 16,
            'FLOAT': 17,
            'QUANTITY': 18,
            'ORDINAL': 19
        }

        unstructured_labeler.predict(data)
        structured_labeler_2 = dp.DataLabeler(labeler_type='structured')
        structured_labeler_2.predict(data2)
Example #11
    def test_multi_labelers(self, *mocks):
        """
        Test Multiple labelers called consecutively.
        :return:
        """
        data = dp.Data(
            data=pd.DataFrame([12, 2, 3, 4, 5]).astype(str), data_type="parquet"
        )
        data2 = dp.Data(data=pd.DataFrame(["atest", "b", "c"]), data_type="csv")

        structured_labeler_1 = dp.DataLabeler(labeler_type="structured")
        structured_labeler_1.predict(data)
        unstructured_labeler = dp.DataLabeler(labeler_type="unstructured")
        unstructured_labeler._label_encoding = {
            "PAD": 0,
            "CITY": 1,  # SAME AS UNKNOWN
            "UNKNOWN": 1,
            "ADDRESS": 2,
            "BAN": 3,
            "CREDIT_CARD": 4,
            "EMAIL_ADDRESS": 5,
            "UUID": 6,
            "HASH_OR_KEY": 7,
            "IPV4": 8,
            "IPV6": 9,
            "MAC_ADDRESS": 10,
            "NAME": 11,  # SAME AS PERSON
            "PERSON": 11,
            "PHONE_NUMBER": 12,
            "SSN": 13,
            "URL": 14,
            "DATETIME": 15,
            "INTEGER_BIG": 16,  # SAME AS INTEGER
            "INTEGER": 16,
            "FLOAT": 17,
            "QUANTITY": 18,
            "ORDINAL": 19,
        }

        unstructured_labeler.predict(data)
        structured_labeler_2 = dp.DataLabeler(labeler_type="structured")
        structured_labeler_2.predict(data2)
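The `_label_encoding` above maps label names to integer ids, with aliases such as NAME/PERSON sharing one id. A self-contained sketch of recovering a canonical name per id from such a mapping:

encoding = {"PAD": 0, "UNKNOWN": 1, "CITY": 1, "PERSON": 11, "NAME": 11}
id_to_label = {}
for name, idx in encoding.items():
    id_to_label.setdefault(idx, name)  # keep the first alias seen per id
print(id_to_label)  # {0: 'PAD', 1: 'UNKNOWN', 11: 'PERSON'}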
Example #12
    def test_warning_tf_multiple_dp_with_update(self):
        test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, "data")
        path = os.path.join(test_dir, "csv/diamonds.csv")

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "datetime.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "correlation.is_enabled": False,
            }
        )
        print("running dp1")
        profile1 = dp.StructuredProfiler(data, options=profile_options)

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "datetime.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "correlation.is_enabled": False,
            }
        )
        print("running dp2")
        profile2 = dp.StructuredProfiler(data, options=profile_options)

        profile1.update_profile(data)
Example #13
    def test_warning_tf_run_dp_merge(self):
        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False
        })
        print('running dp1')
        profile1 = dp.StructuredProfiler(data, options=profile_options)

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False
        })
        print('running dp2')
        profile2 = dp.StructuredProfiler(data, options=profile_options)

        profile = profile1 + profile2
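The final line merges the two profiles with `+`. A sketch of sanity-checking the merge via the report; `global_stats` and `samples_used` are assumed report keys:

# Given profile1 and profile2 from above:
merged = profile1 + profile2
report = merged.report()
# Expect roughly twice the rows observed by either single profile.
print(report["global_stats"]["samples_used"])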
Example #14
    def test_null_in_file(self):
        filename_null_in_file = os.path.join(
            test_root_path, 'data', 'csv/sparse-first-and-last-column.txt')
        profiler_options = ProfilerOptions()
        profiler_options.set({'data_labeler.is_enabled': False})
        data = dp.Data(filename_null_in_file)
        profile = dp.Profiler(data, profiler_options=profiler_options)

        report = profile.report(report_options={"output_format": "pretty"})
        
        self.assertEqual(
            report['data_stats']['COUNT']['statistics']['null_types_index'],
            {'': '[2, 3, 4, 5, 7, 8]'}
        )
        
        self.assertEqual(
            report['data_stats'][' NUMBERS']['statistics']['null_types_index'],
            {'': '[5, 6, 8]', ' ': '[2, 4]'}
        )
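`null_types_index` maps each null representation (here the empty string and a single space) to the rows where it appeared, and the "pretty" output format stringifies those index sets, which is why the assertions compare strings. A sketch of reading the same field without prettifying, assuming the raw values are plain sets:

# Given the `profile` built above:
report = profile.report()  # default output format, no "pretty" stringifying
null_idx = report['data_stats']['COUNT']['statistics']['null_types_index']
print(sorted(null_idx['']))  # e.g. [2, 3, 4, 5, 7, 8] if the raw value is a set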
Example #15
    def test_check_and_return_valid_data_format(self):
        # test incorrect fit_or_predict value
        with self.assertRaisesRegex(ValueError, '`fit_or_predict` must equal '
                                                '`fit` or `predict`'):
            BaseDataLabeler._check_and_return_valid_data_format([], 'oops')

        # test incorrect data type
        with self.assertRaisesRegex(TypeError, "Data must be imported using the"
                                               " data_readers, pd.DataFrames, "
                                               "np.ndarrays, or lists."):
            BaseDataLabeler._check_and_return_valid_data_format('oops')

        # test proper conversion of 2 dimensional structured data
        two_dim = [["this", "is"], ["two", "dimensions"]]
        two_dim_pred = np.array(["this", "is", "two", "dimensions"])
        # for fit
        self.assertTrue(
            np.array_equal(np.array(two_dim),
                           BaseDataLabeler._check_and_return_valid_data_format(
                           two_dim, fit_or_predict='fit')))
        self.assertTrue(
            np.array_equal(np.array(two_dim),
                           BaseDataLabeler._check_and_return_valid_data_format(
                           pd.DataFrame(two_dim), fit_or_predict='fit')))
        self.assertTrue(
            np.array_equal(np.array(two_dim),
                           BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(two_dim), fit_or_predict='fit')))
        # for predict
        self.assertTrue(
            np.array_equal(two_dim_pred,
                           BaseDataLabeler._check_and_return_valid_data_format(
                           two_dim, fit_or_predict='predict')))
        self.assertTrue(
            np.array_equal(two_dim_pred,
                           BaseDataLabeler._check_and_return_valid_data_format(
                           pd.DataFrame(two_dim), fit_or_predict='predict')))
        self.assertTrue(
            np.array_equal(two_dim_pred,
                           BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(two_dim), fit_or_predict='predict')))

        # test proper conversion of 1 dimensional data
        one_dim = ["this", "is", "one", "dimension"]
        one_dim_pred = np.array(one_dim)
        # for fit
        self.assertTrue(
            np.array_equal(np.array(one_dim),
                           BaseDataLabeler._check_and_return_valid_data_format(
                           one_dim, fit_or_predict='fit')))
        self.assertTrue(
            np.array_equal(np.array(one_dim),
                           BaseDataLabeler._check_and_return_valid_data_format(
                           pd.Series(one_dim), fit_or_predict='fit')))
        self.assertTrue(
            np.array_equal(np.array(one_dim),
                           BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(one_dim), fit_or_predict='fit')))
        # for predict
        self.assertTrue(
            np.array_equal(one_dim_pred,
                           BaseDataLabeler._check_and_return_valid_data_format(
                           one_dim, fit_or_predict='predict')))
        self.assertTrue(
            np.array_equal(one_dim_pred,
                           BaseDataLabeler._check_and_return_valid_data_format(
                           pd.DataFrame(one_dim), fit_or_predict='predict')))
        self.assertTrue(
            np.array_equal(one_dim_pred,
                           BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(one_dim), fit_or_predict='predict')))

        # test proper conversion of unstructured labels
        labels = [[(0, 4, "UNKNOWN"), (4, 10, "ADDRESS")],
                  [(0, 5, "SSN"), (5, 8, "UNKNOWN")]]
        validated_labels = \
            BaseDataLabeler._check_and_return_valid_data_format(labels)
        self.assertIsInstance(validated_labels, np.ndarray)
        self.assertEqual(len(validated_labels), 2)
        self.assertEqual(len(validated_labels[0]), 2)
        self.assertEqual(len(validated_labels[0][0]), 3)
        self.assertEqual(validated_labels[0][0][0], 0)
        self.assertEqual(validated_labels[0][1][1], 10)
        self.assertEqual(validated_labels[1][0][2], "SSN")

        # test proper conversion of data reader objects
        for dt in ["csv", "json", "parquet"]:
            data_obj = dp.Data(data=pd.DataFrame(two_dim), data_type=dt)
            val = BaseDataLabeler._check_and_return_valid_data_format(data_obj)
            self.assertTrue(np.array_equal(np.array(two_dim), val))
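Taken together, the 2-D assertions show the validator preserves row/column structure for `fit` but flattens row-major for `predict`. A pure-NumPy illustration of that shape difference:

import numpy as np

two_dim = np.array([["this", "is"], ["two", "dimensions"]])
print(two_dim.shape)            # (2, 2) -- structure kept for fit
print(two_dim.flatten().shape)  # (4,)   -- flattened for predict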
Example #16
    def test_accepted_inputs(self):
        with self.assertRaisesRegex(TypeError,
                                    "Input data must be either a "
                                    "`pd.DataFrame` or a `data_profiler.Data` "
                                    "and not of type `TextData`."):
            dp.train_structured_labeler(None)

        with self.assertRaisesRegex(TypeError,
                                    "The output dirpath must be a string."):
            dp.train_structured_labeler(pd.DataFrame([]), save_dirpath=0)

        with self.assertRaisesRegex(ValueError,
                                    "`default_label` must be a string."):
            dp.train_structured_labeler(pd.DataFrame([]), default_label=1)

        # doesn't accept text data
        text_data = dp.Data(data='test', data_type='text')
        with self.assertRaisesRegex(TypeError,
                                    "Input data must be either a "
                                    "`pd.DataFrame` or a `data_profiler.Data` "
                                    "and not of type `TextData`."):
            dp.train_structured_labeler(text_data)

        with self.assertRaisesRegex(ValueError,
                                    "The `save_dirpath` is not valid or not "
                                    "accessible."):
            dp.train_structured_labeler(
                pd.DataFrame([]), save_dirpath="/a/test")

        # default label not in the label mapping
        data = {'LABEL1': ["word1", "word2"],
                'LABEL2': ["word3", "word4"]}
        df = pd.DataFrame(data=data)

        with self.assertRaisesRegex(ValueError,
                                    "The `default_label` of UNKNOWN must "
                                    "exist in the label mapping."):
            dp.train_structured_labeler(df)

        try:
            data = {'UNKNOWN': ["Beep", "Boop"],
                    'PERSON': ["GRANT", "MENSHENG"]}
            df = pd.DataFrame(data=data)
            dp.train_structured_labeler(df)

            fake_data = dp.Data(data=df, data_type='csv')
            dp.train_structured_labeler(fake_data)

            fake_data = dp.Data(data=df, data_type='json')
            dp.train_structured_labeler(fake_data)

            fake_data = dp.Data(data=df, data_type='parquet')
            dp.train_structured_labeler(fake_data)

        except Exception as e:
            self.fail(str(e))

        # set default label to be in label mapping
        data = {'LABEL1': ["word1", "word2"],
                'LABEL2': ["word3", "word4"]}
        df = pd.DataFrame(data=data)

        try:
            default_label = 'LABEL1'
            data_labeler = dp.train_structured_labeler(
                df, default_label=default_label)
            self.assertTrue(default_label in data_labeler.label_mapping)
            self.assertEqual(default_label,
                             data_labeler.model._parameters['default_label'])
        except Exception as e:
            self.fail(str(e))
Example #17
# parameter alteration
ALLOW_SUBSAMPLING = True  # profiler to subsample the dataset if large
PERCENT_TO_NAN = 0.0  # Value must be between 0 and 100

sample_sizes = [100, 1000, 5000, 7500, int(1e5)]
################################################################################

if __name__ == "__main__":

    # set seed
    random.seed(0)
    np.random.seed(0)
    dp.set_seed(0)

    # load data
    data = dp.Data('data/time_structured_profiler.csv')

    # [0] allows the model to be initialized and added to the labeler
    sample_sizes = [0] + sample_sizes
    profile_times = []
    for sample_size in sample_sizes:
        # setup time dict

        print(f"Evaluating sample size: {sample_size}")
        df = data.data.sample(sample_size, replace=True).reset_index(drop=True)

        if PERCENT_TO_NAN:
            samples_to_nan = int(len(df) * PERCENT_TO_NAN / 100)
            for col_name in df:
                ind_to_nan = random.sample(list(df.index), samples_to_nan)
            df.loc[ind_to_nan, col_name] = 'None'  # .loc avoids chained assignment
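Chained indexing like `df[col][rows] = ...` can silently assign to a temporary copy in pandas, which is why the loop writes through `df.loc`. A self-contained toy version of the injection step with a nonzero percentage:

import random
import pandas as pd

random.seed(0)
df = pd.DataFrame({"a": list("abcdefghij"), "b": list("qrstuvwxyz")})
percent_to_nan = 20  # toy value; the script above defaults to 0.0
samples_to_nan = int(len(df) * percent_to_nan / 100)
for col_name in df:
    ind_to_nan = random.sample(list(df.index), samples_to_nan)
    df.loc[ind_to_nan, col_name] = 'None'  # inject null-like strings
print(df)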