Example #1
    def run(self):
        # No-op detector: never flags a row, so nothing gets repaired.
        dfn = lambda row: (False, -1)
        clf = EvaluateCleaning(self.features, self.labels,
                               copy.copy(self.base_model))
        # Train and test with mean imputation applied to whatever the
        # detector flags (here: nothing), giving the no-cleaning baseline.
        cleanClassifier, ypred, ytrue, yscores = clf.run(
            dfn, 'impute_mean', 'impute_mean')
        print("#####")
        print(yscores)
        self.logging.logResult(
            ["acc_noclean",
             get_acc_scores(ytrue, ypred, yscores)])
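The lambda above is a stub that satisfies the detector-function contract used throughout these examples: given a row, return a pair (is_dirty, error_module_index), where (False, -1) appears to mean "not dirty, no module". A minimal stand-alone sketch of that contract, under that assumption:

def no_clean_dfn(row):
    # Never flags anything, so downstream repairs become no-ops.
    return (False, -1)

assert no_clean_dfn(['a', 1, None]) == (False, -1)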
Example #2
    def run(self):
        # Detect numeric outliers with a single quantitative module.
        mlist = [QuantitativeErrorModule]
        clist = [{'thresh': 10}]
        detector = ErrorDetector(self.features,
                                 modules=mlist,
                                 config=clist,
                                 use_word2vec=False)
        detector.fit()
        dfn = detector.getDetectorFunction()
        clf = EvaluateCleaning(self.features, self.labels,
                               copy.copy(self.base_model))
        # Mean-impute the flagged cells at train and test time, then score.
        cleanClassifier, ypred, ytrue, yscores = clf.run(
            dfn, 'impute_mean', 'impute_mean')
        self.logging.logResult(
            ["acc_qclean", get_acc_scores(ytrue, ypred, yscores)])
Example #3
    def evaluateEnsembleAccuracy(self, roundNo, argmax):
        clf = EvaluateCleaning(self.features, self.labels,
                               copy.copy(self.base_model))
        X = clf.test_features
        ypred, scores = self.predict(X)
        # argmax[0][2] is the ytrue slot of the selected trial's result.
        ypred, ytrue, scores = normalize(ypred, argmax[0][2], scores, roundNo)
        return get_acc_scores(ytrue, ypred, scores)
Example #4
    def refitMax(self, argmax):
        # argmax[1] is the winning trial key (module_index, train_action,
        # test_action); look up its module and config by index.
        mlist = [self.modules[argmax[1][0]]]
        clist = [self.config[argmax[1][0]]]

        if mlist[0] == "None":
            # No-op detector for the "no cleaning" candidate.
            dfn = lambda row: (False, -1)
        else:
            detector = ErrorDetector(self.features,
                                     modules=mlist,
                                     config=clist)
            detector.addLogger(self.logging)
            detector.fit()
            dfn = detector.getDetectorFunction()

        # Re-train the winning (detector, repair) pair from scratch and
        # recompute its unweighted accuracy.
        clf = EvaluateCleaning(self.features, self.labels,
                               copy.copy(self.base_model))
        result = clf.run(dfn, argmax[0][0].train_action,
                         argmax[0][0].test_action)
        cur = accuracy_score(result[2], result[1])  # (ytrue, ypred)

        return (result, argmax[1], argmax[2]), cur
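For reference, the argmax tuple that refitMax unpacks is built in runRound below as (trial[k], k, avail_modules). A toy sketch of that shape, with stand-in values that are purely illustrative:

# Stand-ins only, to illustrate the tuple layout refitMax indexes into.
clean_classifier = object()                       # trained clean classifier
ypred, ytrue, yscores = [1, 0], [1, 1], [0.9, 0.4]
avail_modules = ["None"]

argmax = ((clean_classifier, ypred, ytrue, yscores),   # trial[k]
          (0, 'impute_mean', 'impute_mean'),           # k = (i, tr, te)
          avail_modules)

assert argmax[1][0] == 0        # module index read by refitMax
assert argmax[0][2] == ytrue    # ytrue slot read by evaluateEnsembleAccuracy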
Example #5
    def run(self):
        mlist = [QuantitativeErrorModule, PuncErrorModule]
        clist = [{'thresh': 10}, {}]
        detector = ErrorDetector(self.features, modules=mlist, config=clist)
        detector.fit()
        dfn = detector.getDetectorFunction()
        clf = EvaluateCleaning(self.features, self.labels,
                               copy.copy(self.base_model))

        # runRound fits its own detectors; v is the selected score and
        # i = (trial_result, trial_key, avail_modules).
        v, i = self.runRound(mlist, clist, set())

        # trial_result = (cleanClassifier, ypred, ytrue, yscores).
        self.logging.logResult(
            ["acc_bs", get_acc_scores(i[0][2], i[0][1], i[0][3])])
Example #6
    def runRound(self, avail_modules, avail_config, selected):

        # trial maps (module_index, train_action, test_action) to the
        # tuple (cleanClassifier, ypred, ytrue, yscores).
        trial = {}

        for i, module in enumerate(avail_modules):

            if module == "None":
                # No-op detector: evaluate the no-cleaning baseline.
                dfn = lambda row: (False, -1)
                clf = EvaluateCleaning(self.features, self.labels,
                                       copy.copy(self.base_model))
                cleanClassifier, ypred, ytrue, yscores = clf.run(
                    dfn, 'impute_mean', 'impute_mean')
                trial[(i, 'impute_mean',
                       'impute_mean')] = (cleanClassifier, ypred, ytrue,
                                          yscores)

            else:
                mlist = [module]
                clist = [avail_config[i]]

                detector = ErrorDetector(self.features,
                                         modules=mlist,
                                         config=clist)
                detector.fit()
                dfn = detector.getDetectorFunction()

                # Try every (train repair, test repair) combination not
                # already selected in a previous round.
                for tr in CleanClassifier.avail_train:
                    for te in CleanClassifier.avail_test:

                        if (i, tr, te) in selected:
                            continue

                        clf = EvaluateCleaning(self.features, self.labels,
                                               copy.copy(self.base_model))
                        cleanClassifier, ypred, ytrue, yscores = clf.run(
                            dfn, tr, te)
                        trial[(i, tr, te)] = (cleanClassifier, ypred, ytrue,
                                              yscores)

        # Keep the trial with the *lowest* accuracy (accuracy is at most 1,
        # so minv starts there), matching the "cleaner_ws" log key.
        argmin = None
        minv = 1

        for k in trial:

            arg = (trial[k], k, avail_modules)
            cur = accuracy_score(trial[k][2], trial[k][1])  # (ytrue, ypred)

            if minv >= cur:
                argmin = arg
                minv = cur

        self.logging.logResult(
            ["cleaner_ws", str(avail_modules[argmin[1][0]])])

        return minv, argmin
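The selection loop above is an argmin over per-trial accuracies. A self-contained sketch of the same pattern on toy data (showing both extremes; the method above keeps the minimum, consistent with its 'cleaner_ws' log key):

from sklearn.metrics import accuracy_score

# trial[key] = (clean_classifier, ypred, ytrue, yscores)
trial = {
    (0, 'impute_mean', 'impute_mean'): (None, [1, 0, 1], [1, 1, 1], [0.9, 0.4, 0.8]),
    (1, 'impute_mean', 'impute_mean'): (None, [1, 1, 1], [1, 1, 1], [0.9, 0.7, 0.8]),
}
best = max(trial, key=lambda k: accuracy_score(trial[k][2], trial[k][1]))
worst = min(trial, key=lambda k: accuracy_score(trial[k][2], trial[k][1]))
print(best, worst)   # key (1, ...) scores 1.0; key (0, ...) scores 2/3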
Example #7
    def runRound(self, avail_modules, avail_config, selected,
                 materialized_cache):

        # trial maps (module_index, train_action, test_action) to the
        # tuple (cleanClassifier, ypred, ytrue, yscores).
        trial = {}

        for i, module in enumerate(avail_modules):

            if module == "None":

                # No-op detector: evaluate the no-cleaning baseline.
                dfn = lambda row: (False, -1)
                key = (i, 'impute_mean', 'impute_mean')

                if self.materialize and key in materialized_cache:
                    trial[key] = materialized_cache[key]
                else:
                    clf = EvaluateCleaning(self.features, self.labels,
                                           copy.copy(self.base_model))
                    cleanClassifier, ypred, ytrue, yscores = clf.run(
                        dfn, 'impute_mean', 'impute_mean')
                    materialized_cache[key] = (cleanClassifier, ypred,
                                               ytrue, yscores)
                    trial[key] = materialized_cache[key]

                # If the weights are uninitialized, start uniform over the
                # predictions (read from trial so cache hits work too).
                if self.weights is None:
                    ypred = trial[key][1]
                    self.weights = np.ones((len(ypred), 1)) / len(ypred)

            else:
                mlist = [module]
                clist = [avail_config[i]]

                # Memoize the fitted detector function per module index.
                if self.dfnmemo and (i in self.dfn_cache):
                    dfn = self.dfn_cache[i]
                else:
                    detector = ErrorDetector(self.features,
                                             modules=mlist,
                                             config=clist)
                    detector.addLogger(self.logging)
                    detector.fit()
                    dfn = detector.getDetectorFunction()
                    self.dfn_cache[i] = dfn

                for tr in CleanClassifier.avail_train:
                    for te in CleanClassifier.avail_test:

                        # Reuse a fully materialized trial if available.
                        if self.materialize and ((i, tr, te)
                                                 in materialized_cache):
                            trial[(i, tr, te)] = materialized_cache[(i, tr,
                                                                     te)]
                            continue

                        clf = EvaluateCleaning(self.features, self.labels,
                                               copy.copy(self.base_model))
                        cleanClassifier, ypred, ytrue, yscores = clf.run(
                            dfn, tr, te)
                        materialized_cache[(i, tr, te)] = (cleanClassifier,
                                                           ypred, ytrue,
                                                           yscores)
                        trial[(i, tr, te)] = materialized_cache[(i, tr, te)]

                        # If the weights are uninitialized, start uniform.
                        if self.weights is None:
                            self.weights = np.ones(
                                (len(ypred), 1)) / len(ypred)

        # Select the trial with the highest weighted accuracy.
        argmax = None
        maxv = 0

        for k in trial:

            arg = (trial[k], k, avail_modules)
            cur = accuracy_score(trial[k][2],
                                 trial[k][1],
                                 sample_weight=np.asarray(
                                     self.weights).reshape(-1))

            if maxv < cur:
                argmax = arg
                maxv = cur

        # Re-fit the winner from scratch before returning it.
        argmax, maxv = self.refitMax(argmax)

        print('bc', maxv)

        return maxv, argmax, materialized_cache
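The two caches above trade memory for repeated work: dfn_cache memoizes one fitted detector function per module index, and materialized_cache memoizes whole trial results per (module, train_action, test_action) key. A minimal stand-alone sketch of that pattern, with hypothetical fit/evaluate stand-ins:

dfn_cache = {}
materialized_cache = {}

def get_dfn(i, fit_detector):
    # Fit each detector at most once per module index.
    if i not in dfn_cache:
        dfn_cache[i] = fit_detector(i)
    return dfn_cache[i]

def run_trial(key, evaluate):
    # Reuse a full (classifier, ypred, ytrue, yscores) result if cached.
    if key not in materialized_cache:
        materialized_cache[key] = evaluate(key)
    return materialized_cache[key]

# Hypothetical stand-ins for the expensive fit and evaluate steps.
dfn = get_dfn(0, lambda i: (lambda row: (False, -1)))
result = run_trial((0, 'impute_mean', 'impute_mean'),
                   lambda k: ('clf', [1], [1], [0.9]))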