def test_run(self):
    """Fit and produce with one imputer, then replay it through set_params().

    Asserts that the ``_has_finished`` and ``_iterations_done`` attributes
    equal True after fitting and after restoring params, and that the
    restored imputer produces a frame equal to the fitted imputer's output.
    """
    done_flags = ("_has_finished", "_iterations_done")
    budget = self.enough_time

    # Part 1: train on X, then impute X.
    trained = MeanImputation(hyperparams=hp)
    trained.set_training_data(inputs=X)
    trained.fit(timeout=budget)
    for flag in done_flags:
        self.assertEqual(getattr(trained, flag), True)
    imputed = trained.produce(inputs=X, timeout=budget).value
    self.helper_impute_result_check(X, imputed)

    # Part 2: a second imputer configured only via set_params().
    restored = MeanImputation(hyperparams=hp)
    restored.set_params(params=trained.get_params())
    for flag in done_flags:
        self.assertEqual(getattr(restored, flag), True)
    imputed_again = restored.produce(inputs=X, timeout=budget).value

    # The two imputers must yield the same result.
    outputs_match = imputed_again.equals(imputed)
    self.assertEqual(outputs_match, True)

    # The flags must still hold after produce().
    for flag in done_flags:
        self.assertEqual(getattr(restored, flag), True)
def test_notAlign(self):
    """Check produce() when train and test missing-value columns differ.

    The training set may have `a` columns with missing values while the
    data handed to produce() has `b` such columns, with `a` > `b` or
    `a` < `b`. A row-wise slice of the training data is exercised as well.
    """
    budget = self.enough_time

    baseline = MeanImputation(hyperparams=hp)
    baseline.set_training_data(inputs=X)
    baseline.fit(timeout=budget)
    filled = baseline.produce(inputs=X, timeout=budget).value

    # PART 1: `a` > `b`.
    # Start from the imputed frame and put back only the original "T3"
    # column (per the original note, that column carries missing values).
    data2 = filled.copy()
    data2["T3"] = X["T3"].copy()
    refilled = baseline.produce(inputs=data2, timeout=budget).value
    self.helper_impute_result_check(data2, refilled)

    # PART 2: `a` < `b`.
    narrow = MeanImputation(hyperparams=hp)
    narrow.set_training_data(inputs=data2)
    narrow.fit(timeout=budget)
    widened = narrow.produce(inputs=X, timeout=budget).value
    # X has more missing-value columns than data2; the imputer is expected
    # to fall back to its default impute method for the untrained columns.
    self.helper_impute_result_check(X, widened)

    # PART 3: truncate the data sample-wise.
    sliced = MeanImputation(hyperparams=hp)
    sliced.set_training_data(inputs=X)
    sliced.fit(timeout=budget)
    head_result = sliced.produce(inputs=X[0:20], timeout=budget).value
    self.helper_impute_result_check(X[0:20], head_result)
def test_init(self):
    """Assert that a newly constructed imputer reports both flags as False."""
    fresh = MeanImputation(hyperparams=hp)
    for flag in ("_has_finished", "_iterations_done"):
        self.assertEqual(getattr(fresh, flag), False)
def test_noMV(self):
    """Exercise the imputer on a dataset that has no missing values."""
    budget = self.enough_time

    trained = MeanImputation(hyperparams=hp)
    trained.set_training_data(inputs=X)
    trained.fit(timeout=budget)
    clean = trained.produce(inputs=X, timeout=budget).value

    # 1. produce(): `clean` is already imputed, so running it through
    #    produce() again must give back an equal frame.
    clean_again = trained.produce(inputs=clean, timeout=budget).value
    unchanged = clean.equals(clean_again)
    self.assertEqual(unchanged, True)

    # 2. fit(): train a second imputer on the no-missing-value data.
    #    Nothing is asserted afterwards, so this only checks that the
    #    calls complete.
    # NOTE(review): the original comment also mentions get_params(), but
    # it is never called here - confirm whether a check is missing.
    refit = MeanImputation(hyperparams=hp)
    refit.set_training_data(inputs=clean)
    refit.fit(timeout=budget)
# Preview the frames loaded earlier in the script.
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets["Class"]))
print(testData.head())

# Encode the train and test frames with an encoder fitted on the train frame.
hp = EncHyperparameter.sample()
enc = Encoder(hyperparams=hp)
enc.set_training_data(inputs=trainData)
enc.fit()
encodedData = enc.produce(inputs=trainData).value
encodedTestData = enc.produce(inputs=testData).value

# Initialize the DSBox imputer (`hp` is rebound to the imputer's hyperparams).
hp = MeanHyperparameter.sample()
imputer = MeanImputation(hyperparams=hp)
imputer.set_training_data(inputs=encodedData)  # unsupervised
imputer.fit(timeout=100)  # give 100 seconds to fit

print("\nParams:")
print(imputer.get_params())

# Second imputer configured purely from the first one's params.
imputer2 = MeanImputation(hyperparams=hp)
imputer2.set_params(params=imputer.get_params())
imputedData = imputer2.produce(inputs=encodedData, timeout=100).value

# Train on the imputed training frame, then predict on the imputed test frame.
model = BaggingClassifier()
trainLabels = np.asarray(trainTargets["Class"])
trainedModel = model.fit(imputedData, trainLabels)
imputedTestData = imputer.produce(inputs=encodedTestData).value
predictedTargets = trainedModel.predict(imputedTestData)
def setUp(self):
    """Set the timeout budgets used by the test methods.

    Attributes set:
        enough_time: budget passed as ``timeout=`` to fit()/produce();
            presumably seconds - confirm against the imputer API.
        not_enough_time: a deliberately tiny budget; not referenced by
            the tests visible in this part of the file.

    The fixture previously built a ``MeanImputation`` instance and bound it
    to a local that was never read; that dead construction is removed so
    the fixture only prepares the values the tests actually use.
    """
    self.enough_time = 100
    self.not_enough_time = 0.000001
testDataPath = path.join(dataRoot, "testData.csv")
testData = pd.read_csv(testDataPath)

# Preview the loaded frames and the target labels.
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets["Class"]))
print(testData.head())

# Encode the train and test frames with an encoder fitted on the train frame.
enc = Encoder()
enc.set_training_data(inputs=trainData)
encoderParams = Params(n_limit=10, text2int=True)
enc.set_params(params=encoderParams)
enc.fit()
encodedData = enc.produce(inputs=trainData)
encodedTestData = enc.produce(inputs=testData)

# Initialize the DSBox imputer
imputer = MeanImputation()
imputer.set_params(verbose=0)
imputer.set_training_data(inputs=encodedData)  # unsupervised
imputer.fit(timeout=10)  # give 10 seconds to fit
fitMetadata = imputer.get_call_metadata()
print(fitMetadata)  # to see whether fit worked

imputedData = imputer.produce(inputs=encodedData, timeout=10)
produceMetadata = imputer.get_call_metadata()
print(produceMetadata)  # to see whether produce worked

# Train on the imputed training frame, then predict on the imputed test frame.
model = BaggingClassifier()
trainLabels = np.asarray(trainTargets["Class"])
trainedModel = model.fit(imputedData, trainLabels)
imputedTestData = imputer.produce(inputs=encodedTestData)
predictedTargets = trainedModel.predict(imputedTestData)
print(predictedTargets)

# Outputs the predicted targets in the location specified in the JSON configuration file
"""Sample program for a classification problem.

Reads the pre-encoded training data and targets, removes the ``d3mIndex``
identifier column, and runs ``MeanImputation.produce`` on the feature frame.
"""

import pandas as pd

from dsbox.datapreprocessing.cleaner import MeanImputation


def text2int(col):
    """Convert column values from text to integer codes (0, 1, 2, ...).

    Args:
        col: the column to convert; expected to be a pandas Series, since
            ``astype('category')``, ``.cat`` and ``.name`` are used on it.

    Returns:
        A DataFrame built from the category codes, with a single column
        named after ``col``.
    """
    return pd.DataFrame(col.astype('category').cat.codes, columns=[col.name])


# STEP 1: get data
data_path = "../../dsbox-data/o_38/encoded/"
data_name = data_path + "trainData_encoded.csv"
# make sure your label target is in the second column of this file
label_name = data_path + "trainTargets_encoded.csv"

data = pd.read_csv(data_name)
label = text2int(pd.read_csv(label_name)["Class"])
# DataFrame.drop returns a new frame by default, so the result must be
# rebound; otherwise the id column would still be passed to the imputer.
data = data.drop("d3mIndex", axis=1)  # drop because id, useless

# STEP 2: go to use the Imputer !
imputer = MeanImputation(verbose=1)
result = imputer.produce(inputs=data, timeout=10)
print(imputer.get_call_metadata())  # to see whether produce worked