def test_run(self):
        """Fit and produce with one imputer, then clone it via get_params()/set_params().

        Checks that the private state flags are True after fit(), that a second
        imputer loaded with the first one's params reports the same flags, and
        that both imputers produce equal output for the same input.
        """
        state_flags = ("_has_finished", "_iterations_done")

        # Part 1: train an imputer and validate what it produces for X.
        fitted = MeanImputation(hyperparams=hp)
        fitted.set_training_data(inputs=X)
        budget = self.enough_time
        fitted.fit(timeout=budget)
        for flag_name in state_flags:
            self.assertEqual(getattr(fitted, flag_name), True)

        imputed = fitted.produce(inputs=X, timeout=budget).value
        self.helper_impute_result_check(X, imputed)

        # Part 2: a second imputer receives its state through set_params() only.
        clone = MeanImputation(hyperparams=hp)
        clone.set_params(params=fitted.get_params())
        for flag_name in state_flags:
            self.assertEqual(getattr(clone, flag_name), True)

        clone_imputed = clone.produce(inputs=X, timeout=budget).value
        # Both imputers must yield the same result.
        outputs_match = clone_imputed.equals(imputed)
        self.assertEqual(outputs_match, True)
        # The flags are checked once more after produce().
        for flag_name in state_flags:
            self.assertEqual(getattr(clone, flag_name), True)
    def test_notAlign(self):
        """
        Cover inputs whose missing-value layout differs between fit() and produce().

        With `a` missing-value columns in the training set and `b` in the set
        passed to produce(), the scenarios are `a` > `b`, `a` < `b`, and a
        row-wise truncated input.
        """
        full_imputer = MeanImputation(hyperparams=hp)
        full_imputer.set_training_data(inputs=X)
        full_imputer.fit(timeout=self.enough_time)
        imputed_full = full_imputer.produce(inputs=X,
                                            timeout=self.enough_time).value

        # PART 1 (`a` > `b`): start from the imputed frame and restore only
        # the original "T3" column, so it alone keeps its original values.
        partly_missing = imputed_full.copy()
        partly_missing["T3"] = X["T3"].copy()
        imputed_partial = full_imputer.produce(
            inputs=partly_missing, timeout=self.enough_time).value
        self.helper_impute_result_check(partly_missing, imputed_partial)

        # PART 2 (`a` < `b`): train on the partly restored frame, produce on X.
        # X has more missing-value columns than the training frame, so the
        # imputer is expected to fall back to its default imputation for the
        # columns it was not trained on.
        narrow_imputer = MeanImputation(hyperparams=hp)
        narrow_imputer.set_training_data(inputs=partly_missing)
        narrow_imputer.fit(timeout=self.enough_time)
        imputed_wide = narrow_imputer.produce(inputs=X,
                                              timeout=self.enough_time).value
        self.helper_impute_result_check(X, imputed_wide)

        # PART 3: truncate the data sample-wise and produce on the first 20 rows.
        row_imputer = MeanImputation(hyperparams=hp)
        row_imputer.set_training_data(inputs=X)
        row_imputer.fit(timeout=self.enough_time)
        imputed_head = row_imputer.produce(inputs=X[0:20],
                                           timeout=self.enough_time).value
        self.helper_impute_result_check(X[0:20], imputed_head)
    def test_init(self):
        """Both private state flags must be False right after construction."""
        fresh = MeanImputation(hyperparams=hp)
        for flag_name in ("_has_finished", "_iterations_done"):
            self.assertEqual(getattr(fresh, flag_name), False)
    def test_noMV(self):
        """
        Exercise the imputer on a dataset that has no missing values.
        """
        trained = MeanImputation(hyperparams=hp)
        trained.set_training_data(inputs=X)
        trained.fit(timeout=self.enough_time)
        complete = trained.produce(inputs=X, timeout=self.enough_time).value

        # 1. produce(): `complete` is expected to hold no missing values, so
        #    running it through the imputer again must give an equal frame.
        reimputed = trained.produce(inputs=complete,
                                    timeout=self.enough_time).value
        unchanged = complete.equals(reimputed)
        self.assertEqual(unchanged, True)

        # 2. fit(): train a second imputer on the no-missing-value dataset.
        refit = MeanImputation(hyperparams=hp)
        refit.set_training_data(inputs=complete)
        refit.fit(timeout=self.enough_time)
Пример #5
0
# Preview the inputs; trainData, trainTargets and testData are bound before
# this fragment (not visible here).
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets['Class']))
print(testData.head())

# Fit the encoder on the training data, then encode both the training and the
# test data with that same fitted encoder.
hp = EncHyperparameter.sample()
enc = Encoder(hyperparams=hp)
enc.set_training_data(inputs=trainData)
enc.fit()
encodedData = enc.produce(inputs=trainData).value
encodedTestData = enc.produce(inputs=testData).value

# Initialize the DSBox imputer
# NOTE: `hp` is rebound here, from the encoder's sampled hyperparameters to
# the imputer's.
hp = MeanHyperparameter.sample()
imputer = MeanImputation(hyperparams=hp)
imputer.set_training_data(inputs=encodedData)  # unsupervised
imputer.fit(timeout=100)  # give 100 seconds to fit
print("\nParams:")
print(imputer.get_params())

# Build a second imputer that is never fitted: it only receives the params of
# the fitted one through set_params().
imputer2 = MeanImputation(hyperparams=hp)
imputer2.set_params(params=imputer.get_params())

imputedData = imputer2.produce(inputs=encodedData, timeout=100).value

# Train the classifier on the imputed training data against the 'Class' column.
model = BaggingClassifier()
trainedModel = model.fit(imputedData, np.asarray(trainTargets['Class']))

# The test data is imputed with the originally fitted `imputer` (not
# `imputer2`), and this produce() call passes no timeout.
predictedTargets = trainedModel.predict(
    imputer.produce(inputs=encodedTestData).value)
Пример #6
0
    def setUp(self):
        """Define the timeout budgets (in seconds) shared by the test methods.

        The imputer that used to be constructed here was bound to a local and
        never used; every test builds its own instance, so it is not created
        here any more.
        """
        # Budget passed as `timeout` by the tests that expect calls to finish.
        self.enough_time = 100
        # Deliberately tiny budget, available for tests of the timeout path.
        self.not_enough_time = 0.000001
Пример #7
0
# Load the test split from dataRoot. trainData and trainTargets are used below
# but bound before this fragment (not visible here).
testData = pd.read_csv(path.join(dataRoot, 'testData.csv'))

# Preview the inputs before any preprocessing.
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets['Class']))
print(testData.head())

# Configure and fit the encoder on the training data, then encode both splits.
# In this snippet the produce() results are used directly (no `.value`).
enc = Encoder()
enc.set_training_data(inputs=trainData)
enc.set_params(params=Params(n_limit=10, text2int=True))
enc.fit()
encodedData = enc.produce(inputs=trainData)
encodedTestData = enc.produce(inputs=testData)

# Initialize the DSBox imputer
imputer = MeanImputation()
imputer.set_params(verbose=0)
imputer.set_training_data(inputs=encodedData)  # unsupervised
imputer.fit(timeout=10)  # give 10 seconds to fit
print(imputer.get_call_metadata())  # to see whether fit worked
imputedData = imputer.produce(inputs=encodedData, timeout=10)
print(imputer.get_call_metadata())  # to see whether produce worked

# Train the classifier on the imputed training data against the 'Class' column.
model = BaggingClassifier()
trainedModel = model.fit(imputedData, np.asarray(trainTargets['Class']))

# Impute the encoded test data with the same fitted imputer (no timeout is
# passed here) and predict on the result.
predictedTargets = trainedModel.predict(
    imputer.produce(inputs=encodedTestData))
print(predictedTargets)

# Outputs the predicted targets in the location specified in the JSON configuration file
Пример #8
0
"""
sample program for classification problem
"""
def text2int(col):
    """
    Encode a column's values as integer category codes (0, 1, 2, ...).

    The column is cast to the pandas 'category' dtype and its codes are
    returned as a one-column DataFrame named after the input column.
    """
    as_category = col.astype('category')
    codes = as_category.cat.codes
    column_labels = [col.name]
    return pd.DataFrame(codes, columns=column_labels)

import pandas as pd

from dsbox.datapreprocessing.cleaner import MeanImputation

# STEP 1: get data
data_path = "../../dsbox-data/o_38/encoded/"
data_name = data_path + "trainData_encoded.csv"
# make sure your label target is in the second column of this file
label_name = data_path + "trainTargets_encoded.csv"

data = pd.read_csv(data_name)
# Only the "Class" column of the targets file is used; it is turned into
# integer category codes.
label = text2int(pd.read_csv(label_name)["Class"])

# DataFrame.drop returns a new frame rather than modifying in place, so the
# result has to be rebound for the id column to actually be removed.
data = data.drop("d3mIndex", axis=1)  # drop because id, useless

# STEP 2: go to use the Imputer !
imputer = MeanImputation(verbose=1)
result = imputer.produce(inputs=data, timeout=10)
print(imputer.get_call_metadata())  # to see whether produce worked