def test_report_feature_bow(self):
    """Bag-of-words extraction should write 'representation' into the report."""
    sentences = ["Hi my name is pyml", "Hi name pyml"]
    frame = pd.DataFrame(sentences, columns=["text"])
    feature = Data(
        x_train=frame, test_split_percentage=0.5, split=False, report_name="test"
    )
    feature.bag_of_words()
    with open(feature.report.filename) as report_file:
        report_text = report_file.read()
    found = "representation" in report_text
    os.remove(feature.report.filename)  # clean up the generated report file
    self.assertTrue(found)
def test_report_preprocessing_standardize(self):
    """Numeric normalization should log every column name in the report."""
    raw = [[5.0, 3, 1], [2.0, 2, 1], [10.0, 1, 1]]
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    preprocess = Data(
        x_train=frame, test_split_percentage=0.5, split=False, report_name="test"
    )
    preprocess.normalize_numeric()
    with open(preprocess.report.filename) as report_file:
        report_text = report_file.read()
    found = all(name in report_text for name in ("col1", "col2", "col3"))
    os.remove(preprocess.report.filename)  # clean up the generated report file
    self.assertTrue(found)
def test_dropcolumns_regex(self):
    """drop(regexp=...) should remove every column matching the pattern.

    Fix: the original asserted on ``clean.x_train.columns == ["message"]``,
    which is an elementwise pandas Index comparison — it raises ValueError
    on a length mismatch and has ambiguous truthiness for multiple columns.
    Compare plain lists instead.
    """
    int_missing_data = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4], [1, 2, 3, 6]]
    columns = ["agent.hi", "agent.user_name", "agent.hello", "message"]
    data = pd.DataFrame(int_missing_data, columns=columns)
    clean = Data(
        x_train=data,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.drop(regexp=r"agent*")
    self.assertListEqual(list(clean.x_train.columns), ["message"])
def test_report_cleaning_technique(self):
    """Dropping columns over the missing threshold should be reported."""
    raw = np.array([(1, 0, 0), (0, None, None), (None, None, None)])
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    clean = Data(
        x_train=frame, test_split_percentage=0.5, split=False, report_name="test"
    )
    clean.drop_column_missing_threshold(0.5)
    with open(clean.report.filename) as report_file:
        report_text = report_file.read()
    found = "col2" in report_text and "col3" in report_text
    os.remove(clean.report.filename)  # clean up the generated report file
    self.assertTrue(found)
def test_settargetmapping(self):
    """target_mapping should round-trip a plain attribute assignment."""
    frame = pd.DataFrame({
        "A": [1, 1, 2, 2],
        "B": [1, 2, 3, 4],
        "C": np.random.randn(4),
        "D": ["A", "A", "B", "B"],
    })
    clean = Data(
        x_train=frame,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.target_mapping = "a"
    self.assertEqual(clean.target_mapping, "a")
def test_where(self):
    """where() should keep only rows matching scalar and list conditions."""
    rows = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])
    base = Data(
        x_train=frame,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    subset = base.where(col1=0, col2=2, col3=[3, 4])
    self.assertListEqual(subset.values.tolist(), [[0, 2, 3]])
def test_encodelabels(self):
    """encode_target should produce an index->label mapping in sorted label order."""
    frame = pd.DataFrame({
        "A": [1, 1, 2, 2],
        "B": [1, 2, 3, 4],
        "C": np.random.randn(4),
        "D": ["Bca", "Abc", "Bca", "Bca"],
    })
    clean = Data(
        x_train=frame,
        x_test=None,
        split=False,
        target_field="D",
        report_name="",
        test_split_percentage=0.5,
    )
    clean.encode_target()
    self.assertDictEqual(clean.target_mapping, {0: "Abc", 1: "Bca"})
def test_preprocessnumeric_log(self):
    """Smoke test: log normalization runs with natural, base-2 and base-10 logs."""
    raw = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    preprocess = Data(x_train=frame, test_split_percentage=0.5, report_name="test")
    # Exercise the default (natural log) and both explicit bases.
    for kwargs in ({}, {"base": 2}, {"base": 10}):
        preprocess.normalize_log(**kwargs)
    self.assertTrue(True)  # reaching here without an exception is the test
def test_cleanutil_replacebfill(self):
    """Smoke test: backfill imputation runs over all three columns."""
    raw = [
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    clean = Data(x_train=frame, test_split_percentage=0.5, report_name="test")
    clean.replace_missing_backfill("col1", "col2", "col3")
    self.assertTrue(True)  # reaching here without an exception is the test
def test_search_notequal(self):
    """Smoke test: search() accepts not_equal and replace flags."""
    frame = pd.DataFrame({
        "A": [1, 1, 2, 2],
        "B": [1, 2, 3, 4],
        "C": np.random.randn(4),
        "D": ["A", "A", "B", "B"],
    })
    clean = Data(
        x_train=frame,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.search("A", not_equal=True, replace=True)
    self.assertTrue(True)  # reaching here without an exception is the test
def test_groupbyanalysis(self):
    """Smoke test: groupby_analysis runs on a single grouping column."""
    frame = pd.DataFrame({
        "A": [1, 1, 2, 2],
        "B": [1, 2, 3, 4],
        "C": np.random.randn(4),
        "D": ["A", "A", "B", "B"],
    })
    base = Data(
        x_train=frame,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    base.groupby_analysis(["A"])
    self.assertTrue(True)  # reaching here without an exception is the test
def test_cleanutil_replacerandomdiscrete(self):
    """After random-discrete imputation neither split may contain NaNs.

    Fix: the original combined the per-split checks with ``and``, so the
    test passed whenever just one split was NaN-free even if the other
    still contained NaNs. Use ``or`` so a NaN in either split fails.
    """
    int_missing_data = [
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
    ]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)
    clean = Data(x_train=data, test_split_percentage=0.5, report_name="test")
    clean.replace_missing_random_discrete("col1", "col2", "col3")
    validate = clean.x_train.isnull().values.any() or clean.x_test.isnull().values.any()
    self.assertFalse(validate)
def test_report_cleaning_new_category(self):
    """New-category imputation should report all affected columns.

    Fix: the method's return value was bound to an unused local
    (``clean_data``); the assignment is dropped.
    """
    missing_data = [[1.0, "Green", 2], [1.0, "Other", 1], [np.nan, np.nan, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)
    clean = Data(
        x_train=data, test_split_percentage=0.5, split=False, report_name="test"
    )
    clean.replace_missing_new_category()
    with open(clean.report.filename) as f:
        content = f.read()
    validate = "col1" in content and "col2" in content and "col3" in content
    os.remove(clean.report.filename)  # clean up the generated report file
    self.assertTrue(validate)
def test_preprocess_traindata(self):
    """Normalizing identical train/test frames should yield identical values."""
    raw = [[5.0, 3, 1], [2.0, 2, 1], [10.0, 1, 1]]
    cols = ["col1", "col2", "col3"]
    train_frame = pd.DataFrame(raw, columns=cols)
    test_frame = pd.DataFrame(raw, columns=cols)
    preprocess = Data(
        x_train=train_frame,
        x_test=test_frame,
        test_split_percentage=0.5,
        report_name="test",
    )
    preprocess.normalize_numeric("col1", "col2", "col3")
    self.assertListEqual(
        preprocess.x_train.values.tolist(), preprocess.x_test.values.tolist()
    )
def test_ytrain_dne(self):
    """Assigning y_train with no target field should create a 'label' column."""
    rows = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    base.y_train = [1, 1]
    label_ok = base.x_train["label"].tolist() == [1, 1]
    ytrain_ok = base.y_train.tolist() == [1, 1]
    self.assertTrue(label_ok and ytrain_ok)
def test_normalize_column_names(self):
    """standardize_column_names should lowercase/underscore names in both splits."""
    frame = pd.DataFrame(
        np.zeros((4, 4)),
        columns=["PID", "CapsLock", "space column name", "Caps Space"],
    )
    base = Data(
        x_train=frame,
        x_test=frame,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    base.standardize_column_names()
    expected = ["pid", "capslock", "space_column_name", "caps_space"]
    names_ok = base.columns == expected
    splits_match = base.x_test.columns.tolist() == base.x_train.columns.tolist()
    self.assertTrue(names_ok and splits_match)
def test_preprocess_splitsentences(self):
    """split_sentences should break each document on sentence boundaries."""
    docs = [
        "Hi my name is aethos. Please split me.",
        "This function is going to split by sentence. Automation is great.",
    ]
    frame = pd.DataFrame(data=docs, columns=["data"])
    prep = Data(x_train=frame, split=False, report_name="test")
    prep.split_sentences("data")
    expected = [
        ["Hi my name is aethos.", "Please split me."],
        [
            "This function is going to split by sentence.",
            "Automation is great.",
        ],
    ]
    self.assertListEqual(prep.x_train["data_sentences"].values.tolist(), expected)
def test_featureextractioncategorical_onehot(self):
    """One-hot encoding selected columns with keep_col=False drops the originals."""
    raw = [
        ["Green", "Green", "Car"],
        ["Green", "Other", "Truck"],
        ["Blue", "Other", "Truck"],
    ]
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    feature = Data(
        x_train=frame, test_split_percentage=0.5, split=False, report_name="test"
    )
    feature.onehot_encode(list_of_cols=["col1", "col3"], keep_col=False)
    expected = [
        ["Green", 0, 1, 1, 0],
        ["Other", 0, 1, 0, 1],
        ["Other", 1, 0, 0, 1],
    ]
    self.assertListEqual(feature.x_train.values.tolist(), expected)
def test_cleanutil_replaceindicator_removecol(self):
    """Indicator imputation with keep_col=False should leave 3 columns per split.

    Fix: the original computed ``validate`` and then asserted
    ``self.assertTrue(True)``, so the check could never fail; assert the
    computed condition instead.
    """
    int_missing_data = [
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)
    clean = Data(x_train=data, test_split_percentage=0.5, report_name="test")
    clean.replace_missing_indicator("col1", "col2", "col3", keep_col=False)
    validate = (clean.x_train.shape[1] == 3) and (clean.x_test.shape[1] == 3)
    self.assertTrue(validate)
def test_json_normalize_split(self):
    """expand_json_column should flatten nested dicts across both splits."""
    payloads = [
        ast.literal_eval("{'foo':1, 'bar':2, 'baz':{'foo':2, 'x':1}}"),
        ast.literal_eval("{'foo':3, 'bar':5, 'baz':{'foo':2, 'x':1}}"),
    ]
    frame = pd.DataFrame({"col1": [1, 2], "col2": payloads})
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    base.expand_json_column("col2")
    self.assertListEqual(
        base.x_test.columns.tolist(), ["col1", "foo", "bar", "baz_foo", "baz_x"]
    )
def test_cleancategorical_replacemissingnewcategory_list_constantnotnone(self):
    """Imputing a constant over a column subset leaves other columns untouched."""
    raw = np.array([(1, "Green", 2), (1, "Other", 1), (None, None, None)])
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    clean = Data(
        x_train=frame, test_split_percentage=0.5, split=False, report_name="test"
    )
    clean.replace_missing_new_category(list_of_cols=["col1", "col3"], new_category=0)
    expected = np.array([(1, "Green", 2), (1, "Other", 1), (0, None, 0)]).tolist()
    self.assertListEqual(clean.x_train.values.tolist(), expected)
def test_dropcolumns_complex(self):
    """drop() should honour keep= while removing named and regex-matched columns."""
    rows = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4], [1, 2, 3, 6]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "py"])
    clean = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.drop(
        "col1", keep=["col2"], regexp=r"col*", reason="Columns were unimportant."
    )
    train_ok = list(clean.x_train.columns) == ["col2", "py"]
    test_ok = list(clean.x_test.columns) == ["col2", "py"]
    self.assertTrue(train_ok and test_ok)
def test_getattr(self):
    """Column names should be reachable as attributes via __getattr__."""
    rows = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    self.assertIsNotNone(base.col1)
def test_columns_property(self):
    """The columns property should report all 10 columns of the frame."""
    frame = pd.DataFrame(np.random.rand(100, 10))
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="col3",
        report_name="test",
        test_split_percentage=0.5,
    )
    self.assertTrue(len(base.columns) == 10)
def test_dropcolumns_error(self):
    """drop() with only keep= and nothing to drop should raise TypeError."""
    rows = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4], [1, 2, 3, 6]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "py"])
    clean = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    self.assertRaises(TypeError, clean.drop, keep="col2")
def test_ytest_split(self):
    """A 50% split of 4 rows should leave 2 target values in y_test."""
    rows = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="col3",
        report_name="test",
        test_split_percentage=0.5,
    )
    self.assertTrue(len(base.y_test) == 2)
def test_setattr_trainset(self):
    """Item assignment on the wrapper should write to the (1-row) train split."""
    rows = [[1, 0, 0, 1], [0, 2, 3, 1], [0, 3, 4, 1], [1, 2, 3, 1]]
    frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "col4"])
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.75,
    )
    base["col5"] = [4]
    self.assertListEqual(base["col5"].tolist(), [4])
def test_setitem_tupleeven(self):
    """Assigning a (train, test) tuple must fill both splits without NaNs.

    Fix: the original combined the per-split NaN checks with ``and``, so
    the test passed whenever just one split was NaN-free even if the other
    still contained NaNs. Use ``or`` so a NaN in either split fails.
    """
    int_missing_data = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)
    base = Data(
        x_train=data,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    base["col4"] = ([5, 5], [2, 2])
    validate = any(base.x_train["col4"].isnull()) or any(
        base.x_test["col4"].isnull()
    )
    self.assertFalse(validate)
def test_missing_data(self):
    """Smoke test: the missing_values property is accessible after a split."""
    raw = [
        [np.nan, np.nan, np.nan],
        [0, 2, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
    base = Data(
        x_train=frame,
        x_test=None,
        split=True,
        target_field="",
        report_name="",
        test_split_percentage=0.5,
    )
    base.missing_values  # property access only; must not raise
    self.assertTrue(True)