def test_run(self):
        """Fit/produce with one imputer, then round-trip its params.

        Part 1 fits a `MeanImputation` on `X`, checks its completion flags
        and passes the produced frame to `helper_impute_result_check`.
        Part 2 loads the fitted params into a fresh imputer through
        `set_params()` and checks that it produces an equal result.
        """
        completion_flags = ("_has_finished", "_iterations_done")

        # Part 1: regular fit / produce cycle.
        fitted = MeanImputation(hyperparams=hp)
        fitted.set_training_data(inputs=X)
        fitted.fit(timeout=self.enough_time)
        for flag_name in completion_flags:
            self.assertEqual(getattr(fitted, flag_name), True)

        imputed = fitted.produce(inputs=X, timeout=self.enough_time).value
        self.helper_impute_result_check(X, imputed)

        # Part 2: an imputer restored via set_params() instead of fit().
        restored = MeanImputation(hyperparams=hp)
        restored.set_params(params=fitted.get_params())
        for flag_name in completion_flags:
            self.assertEqual(getattr(restored, flag_name), True)

        imputed_again = restored.produce(inputs=X,
                                         timeout=self.enough_time).value
        # Both imputers carry the same params, so the outputs should be equal.
        self.assertEqual(imputed_again.equals(imputed), True)
        # The flags are checked once more after produce().
        for flag_name in completion_flags:
            self.assertEqual(getattr(restored, flag_name), True)
    def test_noMV(self):
        """
        Run the imputer on a dataset that has no missing values.
        """
        trained = MeanImputation(hyperparams=hp)
        trained.set_training_data(inputs=X)
        trained.fit(timeout=self.enough_time)
        filled = trained.produce(inputs=X, timeout=self.enough_time).value

        # 1. produce(): `filled` is already imputed (expected to hold no
        #    missing value), so producing on it again should return an
        #    equal frame.
        refilled = trained.produce(inputs=filled,
                                   timeout=self.enough_time).value
        self.assertEqual(filled.equals(refilled), True)

        # 2. fit(): train a second imputer on the already-imputed dataset.
        #    Nothing is asserted here; the calls only have to complete.
        retrained = MeanImputation(hyperparams=hp)
        retrained.set_training_data(inputs=filled)
        retrained.fit(timeout=self.enough_time)
    def test_notAlign(self):
        """
        Cover the case where the missing-value columns of the training set
        and of the test set do not line up, e.g. `a` missing-value columns
        in the training set and `b` in the test set, with `a` > `b` or
        `a` < `b`.
        """
        trained_on_full = MeanImputation(hyperparams=hp)
        trained_on_full.set_training_data(inputs=X)
        trained_on_full.fit(timeout=self.enough_time)
        filled = trained_on_full.produce(inputs=X,
                                         timeout=self.enough_time).value

        # PART 1: `a` > `b`
        # Start from the imputed frame and put back only the original "T3"
        # column, so that just this column still carries missing values.
        partial = filled.copy()
        partial["T3"] = X["T3"].copy()
        partial_filled = trained_on_full.produce(
            inputs=partial, timeout=self.enough_time).value
        self.helper_impute_result_check(partial, partial_filled)

        # PART 2: `a` < `b`
        # Train on `partial`, then produce on `X`, which has more
        # missing-value columns; the imputer should trigger its default
        # impute method for the columns it was not trained on.
        trained_on_partial = MeanImputation(hyperparams=hp)
        trained_on_partial.set_training_data(inputs=partial)
        trained_on_partial.fit(timeout=self.enough_time)
        full_filled = trained_on_partial.produce(
            inputs=X, timeout=self.enough_time).value
        self.helper_impute_result_check(X, full_filled)

        # PART 3: truncate the data sample-wise (rows 0:20 only)
        retrained = MeanImputation(hyperparams=hp)
        retrained.set_training_data(inputs=X)
        retrained.fit(timeout=self.enough_time)
        head_filled = retrained.produce(inputs=X[0:20],
                                        timeout=self.enough_time).value
        self.helper_impute_result_check(X[0:20], head_filled)
# Example no. 4
# 0
# Preview the raw frames and the label vector before any preprocessing.
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets['Class']))
print(testData.head())

# Fit the encoder on the training frame, then encode both splits with it.
hp = EncHyperparameter.sample()
enc = Encoder(hyperparams=hp)
enc.set_training_data(inputs=trainData)
enc.fit()
trainEncoding = enc.produce(inputs=trainData)
encodedData = trainEncoding.value
testEncoding = enc.produce(inputs=testData)
encodedTestData = testEncoding.value

# Initialize the DSBox imputer and fit it on the encoded training data
# (unsupervised), allowing 100 seconds for the fit.
hp = MeanHyperparameter.sample()
imputer = MeanImputation(hyperparams=hp)
imputer.set_training_data(inputs=encodedData)
imputer.fit(timeout=100)
print("\nParams:")
print(imputer.get_params())

# Build a second imputer directly from the fitted params and use it to
# impute the training data.
imputer2 = MeanImputation(hyperparams=hp)
imputer2.set_params(params=imputer.get_params())
trainImputation = imputer2.produce(inputs=encodedData, timeout=100)
imputedData = trainImputation.value

# Train the classifier on the imputed training data.
model = BaggingClassifier()
trainLabels = np.asarray(trainTargets['Class'])
trainedModel = model.fit(imputedData, trainLabels)

# The test split is imputed with the originally fitted imputer.
imputedTestData = imputer.produce(inputs=encodedTestData).value
predictedTargets = trainedModel.predict(imputedTestData)
def text2int(col):
    """
    Encode a text column as integer category codes (0, 1, 2, ...).

    `col` is cast to pandas' category dtype and its codes are returned as a
    one-column DataFrame whose column is named after `col.name`.
    """
    as_category = col.astype('category')
    codes = as_category.cat.codes
    return pd.DataFrame(codes, columns=[col.name])


import pandas as pd

from dsbox.datapreprocessing.cleaner import MeanImputation

# STEP 1: get data
data_path = "../../dsbox-data/o_38/encoded/"
data_name = data_path + "trainData_encoded.csv"
# make sure the label target is in the second column of this file
label_name = data_path + "trainTargets_encoded.csv"

data = pd.read_csv(data_name)
label = text2int(pd.read_csv(label_name)["Class"])

# "d3mIndex" is only a row id and useless as a feature. `DataFrame.drop`
# returns a new frame instead of modifying `data` in place, so the result
# must be re-bound; otherwise the id column would still reach the imputer.
data = data.drop("d3mIndex", axis=1)

# STEP 2: use the imputer
imputer = MeanImputation()
imputer.set_params(verbose=1)
imputer.set_training_data(inputs=data)  # unsupervised
imputer.fit(timeout=10)  # give 10 seconds to fit
print(imputer.get_call_metadata())  # to see whether fit worked
result = imputer.produce(inputs=data, timeout=10)
print(imputer.get_call_metadata())  # to see whether produce worked