def run(self):
    """Baseline evaluation with no error detection.

    Uses a detector function that never flags a row as erroneous, so
    cleaning reduces to plain mean imputation on both the train and
    test sides, and logs the accuracy scores under "acc_noclean".

    Fix: removed leftover debug prints (`print "#####"` /
    `print yscores`) — the scores are already captured in the logged
    result below.
    """
    # Detector that marks every row as clean.
    dfn = lambda row: (False, -1)
    clf = EvaluateCleaning(self.features, self.labels,
                           copy.copy(self.base_model))
    cleanClassifier, ypred, ytrue, yscores = clf.run(
        dfn, 'impute_mean', 'impute_mean')
    self.logging.logResult(
        ["acc_noclean", get_acc_scores(ytrue, ypred, yscores)])
def run(self):
    """Evaluate cleaning driven by a single quantitative-error detector.

    Fits a QuantitativeErrorModule (thresh=10, word2vec disabled) on
    self.features, cleans with mean imputation on both train and test,
    and logs the accuracy scores under "acc_qclean".
    """
    modules = [QuantitativeErrorModule]
    configs = [{'thresh': 10}]

    detector = ErrorDetector(self.features, modules=modules,
                             config=configs, use_word2vec=False)
    detector.fit()
    detector_fn = detector.getDetectorFunction()

    evaluator = EvaluateCleaning(self.features, self.labels,
                                 copy.copy(self.base_model))
    model, predicted, actual, score_vals = evaluator.run(
        detector_fn, 'impute_mean', 'impute_mean')

    self.logging.logResult(
        ["acc_qclean", get_acc_scores(actual, predicted, score_vals)])
def evaluateEnsembleAccuracy(self, roundNo, argmax):
    """Score the current ensemble on the evaluator's test features.

    Builds an EvaluateCleaning instance only to obtain its test feature
    matrix, predicts with the ensemble (self.predict), normalizes the
    predictions via normalize() against argmax[0][2] for this round,
    and returns the accuracy scores.
    """
    evaluator = EvaluateCleaning(self.features, self.labels,
                                 copy.copy(self.base_model))
    test_X = evaluator.test_features

    raw_pred, raw_scores = self.predict(test_X)
    preds, truth, norm_scores = normalize(
        raw_pred, argmax[0][2], raw_scores, roundNo)

    return get_acc_scores(truth, preds, norm_scores)
def refitMax(self, argmax):
    """Re-run the winning configuration and return its fresh result.

    argmax[1][0] indexes the chosen module/config; argmax[0][0] carries
    the train/test actions to replay. Returns
    ((result, argmax[1], argmax[2]), accuracy) where result is the
    (cleanClassifier, ypred, ytrue, yscores) tuple from EvaluateCleaning.
    """
    idx = argmax[1][0]
    chosen_modules = [self.modules[idx]]
    chosen_configs = [self.config[idx]]

    if chosen_modules[0] == "None":
        # "None" module: detector that never flags an error.
        detector_fn = lambda row: (False, -1)
    else:
        detector = ErrorDetector(self.features, modules=chosen_modules,
                                 config=chosen_configs)
        detector.addLogger(self.logging)
        detector.fit()
        detector_fn = detector.getDetectorFunction()

    evaluator = EvaluateCleaning(self.features, self.labels,
                                 copy.copy(self.base_model))
    result = evaluator.run(detector_fn,
                           argmax[0][0].train_action,
                           argmax[0][0].test_action)
    accuracy = accuracy_score(result[2], result[1])
    return (result, argmax[1], argmax[2]), accuracy
def run(self):
    """Run the greedy search with quantitative + punctuation modules
    and log the best-round accuracy under "acc_bs".

    Fix: the original also constructed and fit an ErrorDetector and an
    EvaluateCleaning instance here whose results were never used —
    runRound builds its own detectors from mlist/clist — so that dead
    (and expensive) work is removed.
    """
    mlist = [QuantitativeErrorModule, PuncErrorModule]
    clist = [{'thresh': 10}, {}]
    v, i = self.runRound(mlist, clist, set())
    # i is (trial_entry, key, modules); trial_entry is
    # (cleanClassifier, ypred, ytrue, yscores).
    self.logging.logResult(
        ["acc_bs", get_acc_scores(i[0][2], i[0][1], i[0][3])])
def runRound(self, avail_modules, avail_config, selected):
    """Run one round of search over detector/cleaning configurations.

    For each module in avail_modules ("None" means a no-op detector that
    flags nothing), every (train_action, test_action) pair not already in
    `selected` is evaluated with EvaluateCleaning, and the combination
    with the LOWEST accuracy is returned.

    NOTE(review): selecting the minimum accuracy looks intentional given
    the "cleaner_ws" log tag (worst-scoring?), but confirm — a greedy
    improvement step would normally take the maximum.

    :param avail_modules: candidate detector module classes, or "None".
    :param avail_config: per-module config dicts, parallel to avail_modules.
    :param selected: set of (module_index, train, test) keys to skip.
    :returns: (min_accuracy, (trial_entry, key, avail_modules)) where
        trial_entry is (cleanClassifier, ypred, ytrue, yscores).
    """
    trial = {}
    for i, module in enumerate(avail_modules):
        #print avail_modules
        if module == "None":
            # No-op detector: never flags a row as erroneous.
            dfn = lambda row: (False, -1)
            clf = EvaluateCleaning(self.features, self.labels,
                                   copy.copy(self.base_model))
            cleanClassifier, ypred, ytrue, yscores = clf.run(
                dfn, 'impute_mean', 'impute_mean')
            trial[(i, 'impute_mean', 'impute_mean')] = (
                cleanClassifier, ypred, ytrue, yscores)
        else:
            mlist = [module]
            clist = [avail_config[i]]
            detector = ErrorDetector(self.features, modules=mlist,
                                     config=clist)
            detector.fit()
            dfn = detector.getDetectorFunction()
            # Try every train/test cleaning-action pair for this module.
            for tr in CleanClassifier.avail_train:
                for te in CleanClassifier.avail_test:
                    # Skip combinations chosen in earlier rounds.
                    if (i, tr, te) in selected:
                        continue
                    clf = EvaluateCleaning(self.features, self.labels,
                                           copy.copy(self.base_model))
                    cleanClassifier, ypred, ytrue, yscores = clf.run(
                        dfn, tr, te)
                    trial[(i, tr, te)] = (cleanClassifier, ypred,
                                          ytrue, yscores)
    # Pick the trial with the minimum accuracy; `>=` means later-iterated
    # equal trials win ties. trial[k] = (clf, ypred, ytrue, yscores).
    argmin = None
    minv = 1
    for k in trial:
        arg = (trial[k], k, avail_modules)
        cur = accuracy_score(trial[k][2], trial[k][1])
        if minv >= cur:
            argmin = arg
            minv = cur
    # NOTE(review): if trial is empty (everything in `selected`), argmin
    # stays None and the line below raises — confirm callers prevent this.
    self.logging.logResult(
        ["cleaner_ws", str(avail_modules[argmin[1][0]])])
    return minv, argmin
def runRound(self, avail_modules, avail_config, selected, materialized_cache):
    """Run one boosting-style round over detector/cleaning configurations,
    with optional memoization of results and detector functions.

    For each module ("None" means a no-op detector), every
    (train_action, test_action) pair is evaluated — reusing
    materialized_cache entries when self.materialize is set, and cached
    detector functions when self.dfnmemo is set. The combination with the
    highest sample-weighted accuracy is refit via refitMax and returned.

    Fixes:
    - `if self.weights == None:` replaced with `is None` (two sites):
      once self.weights holds a numpy array, `== None` is an elementwise
      comparison whose array result raises ValueError in an `if`.
    - Dropped the leftover `print 'bc', maxv` debug statement.

    :param selected: unused here; kept for interface compatibility with
        the non-caching runRound variant.
    :returns: (max_accuracy, refit_argmax, materialized_cache).
    """
    trial = {}
    for i, module in enumerate(avail_modules):
        if module == "None":
            # No-op detector: never flags a row as erroneous.
            dfn = lambda row: (False, -1)
            key = (i, 'impute_mean', 'impute_mean')
            if self.materialize and (key in materialized_cache):
                trial[key] = materialized_cache[key]
            else:
                clf = EvaluateCleaning(self.features, self.labels,
                                       copy.copy(self.base_model))
                cleanClassifier, ypred, ytrue, yscores = clf.run(
                    dfn, 'impute_mean', 'impute_mean')
                materialized_cache[key] = (cleanClassifier, ypred,
                                           ytrue, yscores)
                trial[key] = (cleanClassifier, ypred, ytrue, yscores)
                # If the weights are uninitialized, start uniform.
                if self.weights is None:
                    self.weights = np.ones((len(ypred), 1)) / len(ypred)
        else:
            mlist = [module]
            clist = [avail_config[i]]
            # Reuse a memoized detector function when allowed.
            if self.dfnmemo and (i in self.dfn_cache):
                dfn = self.dfn_cache[i]
            else:
                detector = ErrorDetector(self.features, modules=mlist,
                                         config=clist)
                detector.addLogger(self.logging)
                detector.fit()
                dfn = detector.getDetectorFunction()
                self.dfn_cache[i] = dfn
            for tr in CleanClassifier.avail_train:
                for te in CleanClassifier.avail_test:
                    if self.materialize and ((i, tr, te) in materialized_cache):
                        trial[(i, tr, te)] = materialized_cache[(i, tr, te)]
                        continue
                    clf = EvaluateCleaning(self.features, self.labels,
                                           copy.copy(self.base_model))
                    cleanClassifier, ypred, ytrue, yscores = clf.run(
                        dfn, tr, te)
                    materialized_cache[(i, tr, te)] = (cleanClassifier,
                                                       ypred, ytrue, yscores)
                    trial[(i, tr, te)] = (cleanClassifier, ypred,
                                          ytrue, yscores)
                    # If the weights are uninitialized, start uniform.
                    if self.weights is None:
                        self.weights = np.ones(
                            (len(ypred), 1)) / len(ypred)
    # Pick the trial with the highest weighted accuracy.
    argmax = None
    maxv = 0
    for k in trial:
        arg = (trial[k], k, avail_modules)
        cur = accuracy_score(trial[k][2], trial[k][1],
                             sample_weight=np.asarray(
                                 self.weights).reshape(-1))
        if maxv < cur:
            argmax = arg
            maxv = cur
    # NOTE(review): if every trial scores 0, argmax stays None and
    # refitMax(None) will fail — confirm upstream guarantees at least
    # one positive-accuracy trial.
    argmax, maxv = self.refitMax(argmax)
    return maxv, argmax, materialized_cache