Example #1
    def __call__(self, data, weight=None):
        """Creates a Bayes model from the data in origTrainingData. """
        if not AZBaseClasses.AZLearner.__call__(self, data, weight):
            return None
        if data.domain.classVar.varType != orange.VarTypes.Discrete:
            raise Exception(
                "AZorngCvBayes can only be used for classification.")
        #Remove from the domain any unused values of discrete attributes including class
        data = dataUtilities.getDataWithoutUnusedValues(data, True)

        #dataUtilities.rmAllMeta(data)
        if len(data.domain.getmetas()) == 0:
            trainingData = data
        else:
            trainingData = dataUtilities.getCopyWithoutMeta(data)
        # Create the imputer
        self.imputer = orange.ImputerConstructor_average(trainingData)
        # Impute the data
        trainingData = self.imputer(trainingData)
        if self.scale:
            self.scalizer = dataUtilities.scalizer()
            self.scalizer.scaleClass = False
            self.scalizer.nMin = -1
            self.scalizer.nMax = 1
            self.trainData = self.scalizer.scaleAndContinuizeData(trainingData)
        else:
            self.trainData = trainingData
            self.scalizer = None

        impData = self.imputer.defaults
        #Convert the ExampleTable to CvMat
        CvMatrices = dataUtilities.ExampleTable2CvMat(self.trainData)
        mat = CvMatrices["matrix"]
        responses = CvMatrices["responses"]
        varTypes = CvMatrices["varTypes"]
        missingDataMask = CvMatrices["missing_data_mask"]

        #Create the model. It MUST be created with the NON-DEFAULT constructor, or create must be called
        classifier = ml.CvNormalBayesClassifier()
        classifier.clear()
        #Train the model
        #CvNormalBayesClassifier::train(const CvMat* _train_data, const CvMat* _responses, const CvMat* _var_idx =0, const CvMat* _sample_idx=0, bool update=false)
        classifier.train(mat, responses, None, None, False)
        return CvBayesClassifier(classifier=classifier,
                                 classVar=trainingData.domain.classVar,
                                 imputeData=impData,
                                 verbose=self.verbose,
                                 varNames=CvMatrices["varNames"],
                                 nIter=None,
                                 basicStat=self.basicStat,
                                 NTrainEx=len(trainingData),
                                 scalizer=self.scalizer,
                                 parameters=self.parameters)
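
Note: the pattern above (build an imputer from the training table, then apply it) is what every example on this page shares. A minimal standalone sketch, assuming an Orange 2.x installation and a hypothetical train.tab data file:

import orange

# Hypothetical tab-delimited data file; any Orange ExampleTable works
data = orange.ExampleTable("train.tab")

# Build the imputer: continuous attributes get their mean,
# discrete attributes their most frequent value
imputer = orange.ImputerConstructor_average(data)

# Apply it to a whole table or to a single example
imputedTable = imputer(data)
imputedExample = imputer(data[0])

# The fitted substitution values are stored as a single example
print imputer.defaults
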
Example #2
def getMahalanobisResults(predictor,
                          invCovMatFile=None,
                          centerFile=None,
                          dataTableFile=None):
    domain = None
    if predictor.highConf == None and predictor.lowConf == None:
        return None, None
    if not dataTableFile and (not hasattr(predictor, "trainDataPath")
                              or not predictor.trainDataPath):
        print "The predictor does not have a trainDataPath specifyed. We need it for calculating Mahalanobis results!"
        return None, None
    testData = dataUtilities.attributeDeselectionData(predictor.exToPred,
                                                      ["SMILEStoPred"])
    if not dataTableFile:
        trainData = dataUtilities.DataTable(predictor.trainDataPath)
        domain = trainData.domain
    else:
        trainData = None
        domain = predictor.model.domain
    ExampleFix = dataUtilities.ExFix(domain, None, False)
    exFixed1 = ExampleFix.fixExample(testData[0])
    if testData.hasMissingValues():
        if not trainData:
            averageImputer = orange.Imputer_defaults(
                predictor.model.imputeData)
        else:
            averageImputer = orange.ImputerConstructor_average(trainData)
        dat = averageImputer(exFixed1)
    else:
        dat = exFixed1

    tab = dataUtilities.DataTable(domain)
    tab.append(dat)

    MD = calcMahalanobis(trainData, tab, invCovMatFile, centerFile,
                         dataTableFile, domain)
    near3neighbors = [(MD[0]["_train_id_near1"], MD[0]["_train_SMI_near1"]),
                      (MD[0]["_train_id_near2"], MD[0]["_train_SMI_near2"]),
                      (MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"])]
    avg3nearest = MD[0]["_train_av3nearest"]
    if avg3nearest < predictor.highConf:
        confStr = predictor.highConfString
    elif avg3nearest > predictor.lowConf:
        confStr = predictor.lowConfString
    else:
        confStr = predictor.medConfString

    return near3neighbors, confStr
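
The confidence string above is chosen by comparing the average distance to the 3 nearest training neighbors against two thresholds: below highConf means the query is close to the training data (high confidence), above lowConf means it is far (low confidence). A toy sketch of that banding logic, with made-up threshold values:

def confidenceBand(avg3nearest, highConf, lowConf):
    """Map an average Mahalanobis distance to a confidence band.
    Small distances mean the query resembles the training set."""
    if avg3nearest < highConf:
        return "HIGH"
    elif avg3nearest > lowConf:
        return "LOW"
    else:
        return "MEDIUM"

print confidenceBand(0.5, highConf=1.0, lowConf=3.0)  # HIGH
print confidenceBand(2.0, highConf=1.0, lowConf=3.0)  # MEDIUM
print confidenceBand(4.5, highConf=1.0, lowConf=3.0)  # LOW
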
Example #3
File: orngLR.py  Project: stefie10/slu_hri
    def __call__(self, examples, weight=0):
        # the next function extends the data set with unknown values
        def createLogRegExampleTable(data, weightID):
            setsOfData = []
            for at in data.domain.attributes:
                # for each attribute, create a new ExampleTable newData
                # and add a new attribute (a continuous variable) to dataOrig, dataFinal and newData
                if at.varType == orange.VarTypes.Continuous:
                    atDisc = orange.FloatVariable(at.name + "Disc")
                    newDomain = orange.Domain(data.domain.attributes +
                                              [atDisc, data.domain.classVar])
                    newDomain.addmetas(data.domain.getmetas())
                    newData = orange.ExampleTable(newDomain, data)
                    altData = orange.ExampleTable(newDomain, data)
                    for i, d in enumerate(newData):
                        d[atDisc] = 0
                        d[weightID] = 1 * data[i][weightID]
                    for i, d in enumerate(altData):
                        d[atDisc] = 1
                        d[at] = 0
                        d[weightID] = 0.000001 * data[i][weightID]
                elif at.varType == orange.VarTypes.Discrete:
                    # in dataOrig, dataFinal and newData, add one more value to attribute "at", whose value is the attribute name + "X"
                    atNew = orange.EnumVariable(at.name,
                                                values=at.values +
                                                [at.name + "X"])
                    newDomain = orange.Domain(
                        filter(lambda x: x != at, data.domain.attributes) +
                        [atNew, data.domain.classVar])
                    newDomain.addmetas(data.domain.getmetas())
                    newData = orange.ExampleTable(newDomain, data)
                    altData = orange.ExampleTable(newDomain, data)
                    for i, d in enumerate(newData):
                        d[atNew] = data[i][at]
                        d[weightID] = 1 * data[i][weightID]
                    for i, d in enumerate(altData):
                        d[atNew] = at.name + "X"
                        d[weightID] = 0.000001 * data[i][weightID]
                newData.extend(altData)
                setsOfData.append(newData)
            return setsOfData

        learner = LogRegLearner(imputer=orange.ImputerConstructor_average(),
                                removeSingular=self.removeSingular)
        # get Original Model
        orig_model = learner(examples, weight)
        if orig_model.fit_status:
            print "Warning: model did not converge"

        # get extended Model (you should not change data)
        if weight == 0:
            weight = orange.newmetaid()
            examples.addMetaAttribute(weight, 1.0)
        extended_set_of_examples = createLogRegExampleTable(examples, weight)
        extended_models = [learner(extended_examples, weight) \
                           for extended_examples in extended_set_of_examples]

        ##        print examples[0]
        ##        printOUT(orig_model)
        ##        print orig_model.domain
        ##        print orig_model.beta
        ##        print orig_model.beta[orig_model.continuizedDomain.attributes[-1]]
        ##        for i,m in enumerate(extended_models):
        ##            print examples.domain.attributes[i]
        ##            printOUT(m)

        # compute the deviations
        # get sum of all betas
        beta = 0
        betas_ap = []
        for m in extended_models:
            beta_add = m.beta[m.continuizedDomain.attributes[-1]]
            betas_ap.append(beta_add)
            beta = beta + beta_add

        # subtract it from the intercept
        #print "beta", beta
        logistic_prior = orig_model.beta[0] + beta

        # compare it to bayes prior
        bayes = orange.BayesLearner(examples)
        bayes_prior = math.log(bayes.distribution[1] / bayes.distribution[0])

        # normalize errors
        ##        print "bayes", bayes_prior
        ##        print "lr", orig_model.beta[0]
        ##        print "lr2", logistic_prior
        ##        print "dist", orange.Distribution(examples.domain.classVar,examples)
        ##        print "prej", betas_ap

        # error normalization - to avoid errors due to assumption of independence of unknown values
        dif = bayes_prior - logistic_prior
        positives = sum(filter(lambda x: x >= 0, betas_ap))
        negatives = -sum(filter(lambda x: x < 0, betas_ap))
        if not negatives == 0:
            kPN = positives / negatives
            diffNegatives = dif / (1 + kPN)
            diffPositives = kPN * diffNegatives
            kNegatives = (negatives - diffNegatives) / negatives
            kPositives = positives / (positives - diffPositives)
            ##        print kNegatives
            ##        print kPositives

            for i, b in enumerate(betas_ap):
                if b < 0: betas_ap[i] *= kNegatives
                else: betas_ap[i] *= kPositives
        #print "potem", betas_ap

        # return the original model and the corresponding a-priori betas
        return (orig_model, betas_ap)
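
The error-normalization step distributes the difference between the Bayes prior and the logistic prior across the per-attribute betas, scaling positive and negative betas separately so that their adjusted sum approximately absorbs dif. A toy numeric sketch of that arithmetic (the values are made up):

# Toy values standing in for the per-attribute betas and the prior difference
betas_ap = [2.0, -1.0, 3.0, -4.0]
dif = 1.0                                             # bayes_prior - logistic_prior

positives = sum([b for b in betas_ap if b >= 0])      # 5.0
negatives = -sum([b for b in betas_ap if b < 0])      # 5.0
kPN = positives / negatives                           # 1.0
diffNegatives = dif / (1 + kPN)                       # 0.5
diffPositives = kPN * diffNegatives                   # 0.5
kNegatives = (negatives - diffNegatives) / negatives  # 0.9
kPositives = positives / (positives - diffPositives)  # ~1.111

adjusted = [b * kNegatives if b < 0 else b * kPositives for b in betas_ap]
print sum(adjusted)  # ~1.06: the betas now absorb (approximately) dif; the original sum was 0
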
Example #4
print "Example w/ missing values"
print data[19]
print "Imputed:"
print imputer(data[19])
print

impdata = imputer(data)
for i in range(20, 25):
    print data[i]
    print impdata[i]
    print


print "\n*** IMPUTING AVERAGE/MAJORITY VALUES ***\n"

imputer = orange.ImputerConstructor_average(data)

print "Example w/ missing values"
print data[19]
print "Imputed:"
print imputer(data[19])
print

impdata = imputer(data)
for i in range(20, 25):
    print data[i]
    print impdata[i]
    print


print "\n*** MANUALLY CONSTRUCTED IMPUTER ***\n"
Example #5
    def test_Impute(self):
        """Test missing values imputation
        Assure that imputation works for the svm models. Test on data with missing values
        """
        ex1 = self.contTest[1]
        ex2 = self.contTest[6]
        self.assert_(ex1["DiscAttr2"] != "?",
                     "The var DiscAttr2 shouldn't be missing!")
        self.assert_(ex2["Level"] != "?",
                     "The var Level shouldn't be missing!")

        imputer = orange.ImputerConstructor_average(self.contTrain)
        svmL = AZorngCvSVM.CvSVMLearner(
            p=0.2
        )  #With p=2 and scaleClass False the model is OK, but with p=2 and class scaling it has no support vectors; with p=0.2 and class scaling it works.
        svmL.svm_type = 103
        svm = svmL(self.contTrain)

        # Prediction for data as it is
        P1 = svm(ex1)
        P2 = svm(ex2)

        # Predictions changing one continuous and one discrete variable to 0
        ex1["DiscAttr2"] = 0
        ex2["Level"] = 0
        P1_0 = svm(ex1)
        P2_0 = svm(ex2)

        # Predictions changing the same continuous and discrete variable to its corresponding imputation value
        ex1["DiscAttr2"] = imputer.defaults["DiscAttr2"]
        ex2["Level"] = imputer.defaults["Level"]
        P1_imp = svm(ex1)
        P2_imp = svm(ex2)

        # Predictions changing the same continuous and discrete variable to '?', which means that the same imputation
        # as in the last case will have to be made inside the classifier. So, the predicted value must be the same
        ex1["DiscAttr2"] = "?"
        ex2["Level"] = "?"
        self.assert_(ex1["DiscAttr2"] == "?",
                     "The var DiscAttr2 should be missing now!")
        self.assert_(ex2["Level"] == "?",
                     "The var Level should be missing now!")

        P1Miss = svm(ex1)
        P2Miss = svm(ex2)

        # Test if the prediction made for the example with missing value is the same as the one
        # for the example whose missing values were substituted using the same method as the classifier does.
        self.assert_(
            round(P1_imp, 4) == round(P1Miss, 4),
            "Imputation was not made correctly inside the classifier")
        self.assert_(
            round(P2_imp, 4) == round(P2Miss, 4),
            "Imputation was not made correctly inside the classifier")

        # Assure that if other substitutions on those variables were made, the predicted value would be different,
        # and so, this is a valid method for testing the imputation
        self.assert_(round(P1.value, 4) != round(
            P2.value,
            4))  # Just to assure that we are not comparing equal examples
        self.assert_(round(P1.value, 4) != round(P1_imp.value, 4))
        self.assert_(round(P1_0.value, 4) != round(P1_imp.value, 4))
        self.assert_(round(P2.value, 4) != round(P2_imp.value, 4))
        self.assert_(round(P2_0.value, 4) != round(P2_imp.value, 4))

        #Test the imputer for saved models
        # Save the model
        scratchdir = os.path.join(AZOC.SCRATCHDIR,
                                  "scratchdirSVMtest" + str(time.time()))
        os.mkdir(scratchdir)
        modelPath = os.path.join(scratchdir, "CvSVMModel")
        svm.write(modelPath)

        # Read in the model
        svmM = AZorngCvSVM.CvSVMread(modelPath)
        # Predict the ex1 and ex2 which are still the examples with missing values '?'
        self.assert_(ex1["DiscAttr2"] == "?",
                     "Value of Var DiscAttr2 should be missing!")
        self.assert_(ex2["Level"] == "?",
                     "Value of Var Level should be missing!")
        self.assert_(
            round(svmM(ex1), 4) == round(P1Miss, 4),
            "Imputation on loaded model is not correct")
        self.assert_(
            round(svmM(ex2), 4) == round(P2Miss, 4),
            "Imputation on loaded model is not correct")
        # Remove the scratch directory
        os.system("/bin/rm -rf " + scratchdir)
Example #6
    def __call__(self, data, weight = None):
        """Creates an SVM model from the data in origTrainingData. """
        if not AZBaseClasses.AZLearner.__call__(self, data, weight):
            if self.verbose > 0: print "Could not create base class instance"
            return None
        dataUtilities.verbose = self.verbose
        #Remove from the domain any unused values of discrete attributes including class
        data = dataUtilities.getDataWithoutUnusedValues(data,True)

        #dataUtilities.rmAllMeta(data) 
        if len(data.domain.getmetas()) == 0:
            trainingData = data
        else:
            trainingData = dataUtilities.getCopyWithoutMeta(data)
        # Create the imputer
        self.imputer = orange.ImputerConstructor_average(trainingData)
        # Impute the data 
        trainingData = self.imputer(trainingData)
        if self.scaleData:
            self.scalizer = dataUtilities.scalizer()
            for attr in ("nMin","nMax","nClassMin","nClassMax"):
                setattr(self.scalizer, attr, getattr(self, attr))
            #Only scale the class in regression. On classification, set scaleClass to False
            self.scalizer.scaleClass = bool(self.scaleClass and trainingData.domain.classVar.varType == orange.VarTypes.Continuous)
            self.scalizer.nClassMin = self.nClassMin
            self.scalizer.nClassMax = self.nClassMax
            self.trainData = self.scalizer.scaleAndContinuizeData(trainingData)
        else:
            self.trainData = trainingData
            self.scalizer = None

        impData=self.imputer.defaults
        #Adjust the svm type according to the problem (regression or classification)
        if self.svm_type != 102:
            if trainingData.domain.classVar.varType == orange.VarTypes.Continuous:
                if self.svm_type in (100,101):
                    self.svm_type += 3
                    self.eps = self.epsR    #Regression eps
            else:
                if self.svm_type in (103,104):
                    self.svm_type -= 3
                    self.eps = self.epsC    #Classification eps
        #Convert the ExampleTable to CvMat
        CvMatices = dataUtilities.ExampleTable2CvMat(self.trainData)
        mat = CvMatices["matrix"]
        responses = CvMatices["responses"]
        varTypes = CvMatices["varTypes"]

        #Configure SVM self.params
        self.params = ml.CvSVMParams()
        self.params.svm_type = self.svm_type
        self.params.kernel_type = self.kernel_type
        self.params.degree = self.degree
        self.params.gamma = self.gamma
        self.params.coef0 = self.coef0
        self.params.C = self.C
        self.params.nu = self.nu
        self.params.p = self.p
        #Process the priors from a str, list or dict to  a valid list 
        priors = self.convertPriors(self.priors,trainingData.domain.classVar)
        if type(priors) == str: #If a string is returned, there was a failure, and it is the respective error message.
            print priors
            return None

        if priors and self.params.svm_type != ml.CvSVM.C_SVC:
            priors = None
            if self.verbose > 0: print "WARNING: The priors will not have any effect. They can only be used with C_SVC SVM-Type."
        elif priors:
            priors = dataUtilities.List2CvMat(priors)

        self.params.class_weights = priors

        term_crit = cv.CvTermCriteria()
        term_crit.type = self.stopCrit      #cv.CV_TERMCRIT_EPS or cv.CV_TERMCRIT_ITER
        term_crit.epsilon = self.eps
        term_crit.max_iter = self.maxIter
        self.params.term_crit = term_crit

        #Create the model
        classifier = ml.CvSVM()
        #Train the model
        #train(trainData, responses, varIdx, SampleIdx, Params)
        classifier.train(mat,responses,None,None,self.params)
        if classifier.get_support_vector_count() < 1:
            print "WARNING: The number of support vectors is 0." 
            print "This could be becasue the margin between the hyper plane and the support vectors has become zero."
            print "Try to modify the parameters controlling the margin. "
            print "For example decrease C or p(regression only)."
            print "No SVM model returned!"
            return None
        else:
            return CvSVMClassifier(classifier=classifier,
                                   classVar=data.domain.classVar,
                                   scalizer=self.scalizer,
                                   imputeData=impData,
                                   verbose=self.verbose,
                                   varNames=CvMatices["varNames"],
                                   basicStat=self.basicStat,
                                   NTrainEx=len(trainingData),
                                   parameters=self.parameters)
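
The svm_type +/- 3 adjustment above leans on OpenCV's CvSVM type codes: C_SVC=100, NU_SVC=101, ONE_CLASS=102, EPS_SVR=103, NU_SVR=104, so adding 3 turns each classification type into its regression counterpart (ONE_CLASS is left alone). A small sketch of that mapping:

# OpenCV CvSVM type codes (old ml module)
SVM_TYPES = {100: "C_SVC", 101: "NU_SVC", 102: "ONE_CLASS",
             103: "EPS_SVR", 104: "NU_SVR"}

def adjustSvmType(svm_type, isRegression):
    """Mirror the learner above: 100/101 <-> 103/104, 102 untouched."""
    if isRegression and svm_type in (100, 101):
        return svm_type + 3
    if not isRegression and svm_type in (103, 104):
        return svm_type - 3
    return svm_type

print SVM_TYPES[adjustSvmType(100, True)]   # EPS_SVR
print SVM_TYPES[adjustSvmType(104, False)]  # NU_SVC
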
Example #7
    def test_BuiltIn_Impute(self):
        """Test RF BuiltIn missing values imputation
        Assure that imputation works for the rf models. Test on data with missing values
        """
        #This data is loaded here to speed up the test suite since it is too big
        contTestDataPath = os.path.join(AZOC.AZORANGEHOME,
                                        "tests/source/data/linearTest.tab")
        contTrainDataPath = os.path.join(AZOC.AZORANGEHOME,
                                         "tests/source/data/linearTrain.tab")
        contTrain = dataUtilities.DataTable(contTrainDataPath)
        contTest = dataUtilities.DataTable(contTestDataPath)

        ex1 = contTest[5]
        ex2 = contTest[2]
        AttrEx1 = "Desc 71"
        AttrEx2 = "Desc 72"
        self.assert_(ex1[AttrEx1] != "?",
                     "The var Desc 671 shouldn't be missing!")
        self.assert_(ex2[AttrEx2] != "?",
                     "The var Desc 138 shouldn't be missing!")

        imputer = orange.ImputerConstructor_average(contTrain)
        RFlearner = AZorngRF.RFLearner(NumThreads = 1, maxDepth = "20", minSample = "5", useSurrogates = "false", getVarVariance = "false", \
                                        nActVars = "0", nTrees = "100", forestAcc = "0.001", termCrit = "0",useBuiltInMissValHandling = True )
        rf = RFlearner(contTrain)

        # Prediction for data as it is
        P1 = rf(ex1)
        P2 = rf(ex2)

        # Predictions changing one continuous and one discrete variable to 0
        ex1[AttrEx1] = 0
        ex2[AttrEx2] = 0
        P1_0 = rf(ex1)
        P2_0 = rf(ex2)

        # Predictions changing the same continuous and discrete variable to its corresponding imputation value
        #ex1["Desc 71"]=imputer.defaults["Desc 71"]
        #ex2["Desc 138"]=imputer.defaults["Desc 138"]
        #P1_imp=rf(ex1)
        #P2_imp=rf(ex2)

        # Predictions changing the same continuous and discrete variable to '?', which means that the same imputation
        # as in the last case will have to be made inside the classifier. So, the predicted value must be the same
        ex1[AttrEx1] = "?"
        ex2[AttrEx2] = "?"
        self.assert_(ex1[AttrEx1] == "?",
                     "The var Desc 71 should be missing now!")
        self.assert_(ex2[AttrEx2] == "?",
                     "The var Desc 138 should be missing now!")
        P1Miss = rf(ex1)
        P2Miss = rf(ex2)

        # Test if the prediction made for the example with missing value is the same as the one
        # for the example whose missing values were substituted using the same method as the classifier does.
        #self.assert_(P1_imp==P1Miss,"Imputation was not made correctly inside the classifier")
        #self.assert_(P2_imp==P2Miss,"Imputation was not made correctly inside the classifier")

        # Assure that if other substitutions on those variables were made, the predicted value would be different,
        # and so, this is a valid method for testing the imputation

        self.assert_(
            P1.value !=
            P2.value)  # Just to assure that we are not comparing equal examples
        self.assert_(
            P1.value != P1Miss.value,
            "The imputed 1 was the same as the original ... try other example")
        self.assert_(
            P1_0.value != P1Miss.value,
            "The imputed 1 was the same as the replaced by 0. The classifier may be replacing missing values by 0"
        )
        self.assert_(
            P2.value != P2Miss.value,
            "The missing imputed 2 was the same as the original ... try other example"
        )
        #self.assert_(P2_0.value!=P2Miss.value,"The missing imputed 2 was the same as the replaced by 0. The classifier may be replacing missing values by 0")

        self.assert_(rf.useBuiltInMissValHandling == True)
        #Test the imputer for saved models
        # Save the model
        scratchdir = os.path.join(AZOC.SCRATCHDIR,
                                  "scratchdirTest" + str(time.time()))
        os.mkdir(scratchdir)
        modelPath = os.path.join(scratchdir, "RFModel")
        rf.write(modelPath)

        # Read in the model
        rfM = AZorngRF.RFread(modelPath)
        self.assert_(rfM.useBuiltInMissValHandling == True)
        # Predict the ex1 and ex2 which are still the examples with missing values '?'
        self.assert_(ex1[AttrEx1] == "?",
                     "Value of Var Desc 6 should be missing!")
        self.assert_(ex2[AttrEx2] == "?",
                     "Value of Var Desc 71 should be missing!")
        self.assert_(
            rfM(ex1) == P1Miss, "Imputation on loaded model is not correct")
        self.assert_(
            rfM(ex2) == P2Miss, "Imputation on loaded model is not correct")
        # Remove the scratch directory
        os.system("/bin/rm -rf " + scratchdir)
Example #8
def calcMahalanobis(data,
                    testData,
                    invCovMatFile=None,
                    centerFile=None,
                    dataTableFile=None,
                    domain=None,
                    nNN=NO_OF_NEIGHBORS):
    """
    Calculates Mahalanobis distances.
    The data should only contain attributes that are relevant for similarity. NB: the data is assumed to have a response variable.
    data - X matrix used to calculate the covariance matrix
    testData - the examples in an ExampleTable object for which to calculate the MDs
    Returns a list of Mahalanobis distances between the examples in testData and training data.
    The elements of the list are dictionaries, giving the Mahalanobis distances to the average (_MD), the nearest neighbor and 
    an average of the 3 nearest neighbors (_train_av3nearest). 
    """

    # Impute any missing values
    if data:
        averageImputer = orange.ImputerConstructor_average(data)
        data = averageImputer(data)
    #If the class is continuous and all class values are unknown (and they usually are in examples to predict), the imputer cannot be created.
    # Since we are only using attributes, not the class, we will assign 0 to the class values in order to impute the testData
    if testData.domain.classVar and testData.domain.classVar.varType == orange.VarTypes.Continuous:
        for ex in testData:
            if ex.getclass().isSpecial():
                ex.setclass(0)
    # This can also happen when calculating a single example with missing attributes
    try:
        averageImputer = orange.ImputerConstructor_average(testData)
    except:
        for ex in testData:
            for attr in [
                    a for a in testData.domain.attributes
                    if a.varType == orange.VarTypes.Continuous
            ]:
                if ex[attr].isSpecial():
                    ex[attr] = 0
        averageImputer = orange.ImputerConstructor_average(testData)

    testData = averageImputer(testData)

    #Test if there is any non-numeric value within the dataset
    for ex in testData:
        #It is much faster to address the ex elements by their position instead of the corresponding name
        for idx in range(len(ex.domain.attributes)):
            if not miscUtilities.isNumber(ex[idx].value):
                raise Exception("Cannot calculate Mahalanobis distances. The attribute '" + \
                      ex.domain.attributes[idx].name + "' has non-numeric values. Ex: " + \
                      str(ex[idx].value))
    if data:
        trainingSet = getTrainingSet(data)
        trainingset_descriptor_names = trainingSet.descr_names
    else:
        trainingSet = None
        trainingset_descriptor_names = [
            attr.name for attr in domain.attributes
        ]
    mahalanobisCalculator = Mahalanobis.MahalanobisDistanceCalculator(
        trainingSet, invCovMatFile, centerFile, dataTableFile)
    MDlist = []
    for ex in testData:
        # Create a numeric vector from the example and assure the same order as in trainingset_descriptor_names
        descriptor_values = []
        for name in trainingset_descriptor_names:
            try:
                descriptor_values.append(float(ex[name].value))
            except:
                raise Exception(
                    "Not possible to calculate Mahalanobis distances. Some attribute is not numeric."
                )

        #descriptor_values = [1.5] * len(trainingset_descriptor_names)
        MD = mahalanobisCalculator.calculateDistances(descriptor_values, nNN)
        MDlist.append(MD)
    return MDlist
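
For reference, the quantity behind MahalanobisDistanceCalculator is the classical Mahalanobis distance sqrt((x - c)^T S^-1 (x - c)), with c the training center and S the covariance of the training descriptors; the pseudo-inverse (as used elsewhere on this page) tolerates singular covariance matrices. A plain-numpy sketch:

import numpy

def mahalanobisToCenter(x, train):
    """Distance from x to the mean of the rows of train, using the
    pseudo-inverse so a singular covariance matrix does not fail."""
    center = numpy.average(train, 0)
    invCov = numpy.linalg.pinv(numpy.cov(train, rowvar=0), rcond=1e-10)
    d = x - center
    return numpy.sqrt(numpy.dot(numpy.dot(d, invCov), d))

train = numpy.array([[1.0, 2.0], [2.0, 3.0], [3.0, 5.0], [4.0, 4.0]])
print mahalanobisToCenter(numpy.array([2.5, 3.5]), train)
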
Example #9
    def __call__(self, data, weight=None):
        """Creates a Boost model from the data in origTrainingData. """
        if not AZBaseClasses.AZLearner.__call__(self, data, weight):
            return None
        if data.domain.classVar.varType != orange.VarTypes.Discrete:
            print "AZorngCvBoost can only be used for binary classification."
            return None
        #Remove from the domain any unused values of discrete attributes including class
        data = dataUtilities.getDataWithoutUnusedValues(data, True)

        #dataUtilities.rmAllMeta(data)
        if len(data.domain.getmetas()) == 0:
            trainingData = data
        else:
            trainingData = dataUtilities.getCopyWithoutMeta(data)
        # Create the imputer
        self.imputer = orange.ImputerConstructor_average(trainingData)
        # Impute the data
        self.trainData = self.imputer(trainingData)

        impData = self.imputer.defaults
        #Convert the ExampleTable to CvMat
        CvMatrices = dataUtilities.ExampleTable2CvMat(self.trainData)
        mat = CvMatrices["matrix"]
        responses = CvMatrices["responses"]
        varTypes = CvMatrices["varTypes"]
        missingDataMask = CvMatrices["missing_data_mask"]

        #Configure Boost params
        #First, Correct any wrong parameters Combination:
        #   CVBOOSTTYPE = { "DISCRETE":0, "REAL":1, "LOGIT":2, "GENTLE":3 }
        #   CVBOOSTSPLITCRIT = { "DEFAULT":0, "GINI":1, "MISCLASS":3, "SQERR":4 }
        if self.boost_type not in AZOC.CVBOOSTTYPE:
            print "ERROR: Bad value for parameter boost_type. Possible values: " + string.join(
                [x for x in AZOC.CVBOOSTTYPE], ", ")
            return None
        if self.split_criteria not in AZOC.CVBOOSTSPLITCRIT:
            print "ERROR: Bad value for parameter split_criteria. Possible values: " + string.join(
                [x for x in AZOC.CVBOOSTSPLITCRIT], ", ")
            return None

        if self.boost_type == "DISCRETE":
            if self.split_criteria not in ["MISCLASS", "GINI"]:
                print "WARNING: For Discrete type, the split Criteria must be MISCLASS or GINI. MISCLASS was used by default."
                self.split_criteria = "MISCLASS"
        if self.boost_type == "REAL":
            if self.split_criteria not in ["MISCLASS", "GINI"]:
                print "WARNING: For REAL type, the split Criteria must be MISCLASS or GINI. GINI was used by default."
                self.split_criteria = "GINI"
        if self.boost_type in ["LOGIT", "GENTLE"]:
            if self.split_criteria != "SQERR":
                print "WARNING: For LOGIT and GENTLE types, the split Criteria must be SQERR. SQERR was used by default."
                self.split_criteria = "SQERR"

        params = ml.CvBoostParams()
        params.boost_type = AZOC.CVBOOSTTYPE[self.boost_type]
        params.split_criteria = AZOC.CVBOOSTSPLITCRIT[self.split_criteria]
        params.weak_count = self.weak_count
        params.weight_trim_rate = self.weight_trim_rate
        params.max_depth = self.max_depth
        params.use_surrogates = self.use_surrogates

        #Create the model it MUST be created with the NON DEFAULT constructor or must call create
        classifier = ml.CvBoost()
        #Train the model
        #train(const CvMat* _train_data, int _tflag, const CvMat* _responses, const CvMat* _var_idx=0, const CvMat* _sample_idx=0, const CvMat* _var_type=0, const CvMat* _missing_mask=0, CvBoostParams params=CvBoostParams(), bool update=false)
        #sampleWeights = cv.cvCreateMat(1,len(self.trainData),cv.CV_32FC1)
        #cv.cvSet(sampleWeights,1.0)

        #compute priors (sample weights)
        priors = self.convertPriors(self.priors,
                                    self.trainData.domain.classVar)
        if type(priors) == str:  #If a string is returned, there was a failure, and it is the respective error message.
            print priors
            return None
        #Train the model
        if self.verbose: self.printParams(params)
        classifier.train(mat, ml.CV_ROW_SAMPLE, responses, None, None,
                         varTypes, missingDataMask, params, False,
                         priors and str(priors).replace(",", " ") or None)
        return CvBoostClassifier(classifier=classifier,
                                 classVar=self.trainData.domain.classVar,
                                 imputeData=impData,
                                 verbose=self.verbose,
                                 varNames=CvMatrices["varNames"],
                                 nIter=None,
                                 basicStat=self.basicStat,
                                 NTrainEx=len(trainingData),
                                 parameters=self.parameters)
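
The warnings above encode a fixed compatibility rule between boost type and split criterion. Restated as a small sketch, grounded only in the checks shown above:

# Valid split criteria per boost type, and the fallback the learner applies
VALID_SPLIT = {"DISCRETE": ("MISCLASS", "GINI"),
               "REAL":     ("MISCLASS", "GINI"),
               "LOGIT":    ("SQERR",),
               "GENTLE":   ("SQERR",)}
DEFAULT_SPLIT = {"DISCRETE": "MISCLASS", "REAL": "GINI",
                 "LOGIT": "SQERR", "GENTLE": "SQERR"}

def fixSplitCriteria(boost_type, split_criteria):
    """Return a split criterion that is valid for the given boost type."""
    if split_criteria not in VALID_SPLIT[boost_type]:
        return DEFAULT_SPLIT[boost_type]
    return split_criteria

print fixSplitCriteria("REAL", "SQERR")   # GINI
print fixSplitCriteria("LOGIT", "GINI")   # SQERR
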
Example #10
    def __call__(self, trainingData, weight = None):
        """Creates an RF model from the data in trainingData. """
        if not AZBaseClasses.AZLearner.__call__(self,trainingData, weight):
            return None

        # Set the number of threads to be used by opencv
        cv.cvSetNumThreads(max(int(self.NumThreads),0))
        #Remove from the domain any unused values of discrete attributes including class
        trainingData = dataUtilities.getDataWithoutUnusedValues(trainingData,True)

        # Object holding the data req for predictions (model, domain, etc)
        #print time.asctime(), "=superRFmodel(trainingData.domain)"
        ##scPA
        # Remove meta attributes from training data
        #dataUtilities.rmAllMeta(trainingData)
        if len(trainingData.domain.getmetas()) == 0:
            trainData = trainingData
        else:
            trainData = dataUtilities.getCopyWithoutMeta(trainingData)
        # Impute the data and Convert the ExampleTable to CvMat 
        if self.useBuiltInMissValHandling:
            #Create the imputer empty since we will not be using it
            impData = dataUtilities.DataTable(trainData.domain)
            CvMatrices = dataUtilities.ExampleTable2CvMat(trainData)
        else:
            #Create the imputer
            self.imputer = orange.ImputerConstructor_average(trainData)
            impData=self.imputer.defaults
            trainData = self.imputer(trainData)
            CvMatrices = dataUtilities.ExampleTable2CvMat(trainData)
            CvMatrices["missing_data_mask"] = None
        ##ecPA
        self.learner = ml.CvRTrees()  #superRFmodel(trainData.domain)    #This call creates a scratchDir

        # Set RF model parameter values
        #  when nActVars defined as 0, use the sqrt of number of attributes so the user knows what will be used
        # This would be done in the C level if left as 0
        if self.nActVars == "0" and len(trainData.domain.attributes)>0:
            self.nActVars =  str(int(sqrt(len(trainData.domain.attributes))))
	#print time.asctime(), "=self.setParameters"
        params = self.setParameters(trainData)
        # Print values of the parameters
        if self.verbose > 0: self.printOuts(params)
        #**************************************************************************************************//
        #                      Check for irrational input arguments
        #**************************************************************************************************//
        if params.min_sample_count >= len(trainingData):
            if self.verbose > 0: print "ERROR! Invalid minSample: ",params.min_sample_count
            if self.verbose > 0: print "minSample must be smaller than the number of examples."
            if self.verbose > 0: print "The number of examples is: ",len(trainingData)
            if len(trainingData) > 10:
                if self.verbose > 0: print "minSample assigned to default value: 10"
                params.min_sample_count = 10
            else:
                if self.verbose > 0: print "Too few examples!!"
                if self.verbose > 0: print "Terminating"
                if self.verbose > 0: print "No random forest model built"
                return None
        if params.nactive_vars > len(trainingData.domain.attributes):
            if self.verbose > 0: print "ERROR! Invalid nActVars: ",params.nactive_vars
            if self.verbose > 0: print "nActVars must be smaller than or equal to the number of variables."
            if self.verbose > 0: print "The number of variables is: ", len(trainingData.domain.attributes)
            if self.verbose > 0: print "nActVars assigned to default value: sqrt(nVars)=",sqrt(len(trainingData.domain.attributes))
            params.nactive_vars = 0
        # Train RF model on data in openCVFile
	#print time.asctime(), "=Start Training"
        #Process the priors and Count the number of values in class var
        if  trainingData.domain.classVar.varType == orange.VarTypes.Discrete:
            cls_count = len(trainData.domain.classVar.values)
            priors = self.convertPriors(self.priors,trainingData.domain.classVar)
            if type(priors) == str: #If a string is returned, there was a failure, and it is the respective error message.
                print priors
                return None 
        else:
            cls_count = 0
            priors = None
        # Call the train method
        self.learner.train(CvMatrices["matrix"], ml.CV_ROW_SAMPLE,
                           CvMatrices["responses"], None, None,
                           CvMatrices["varTypes"],
                           CvMatrices["missing_data_mask"], params, cls_count,
                           priors and str(priors).replace(",", " ") or None)
        if self.learner.get_var_importance():
            varImportanceList = self.learner.get_var_importance()
            varImportance = {}
            varName = []
            varImp = []
            for idx,attr in enumerate(CvMatrices["varNames"]):
                varImportance[attr] = varImportanceList[idx]
            #Uncomment the next lines if the output is needed already ordered
            #============================= begin =================================
            #    varName.append(attr)
            #    varImp.append(varImportanceList[idx])
            #Order the vars in terms of importance
            # insertion sort algorithm
            #for i in range(1, len(varImp)):
            #    save = varImp[i]
            #    saveName = varName[i]
            #    j = i
            #    while j > 0 and varImp[j - 1] < save:
            #        varImp[j] = varImp[j - 1]
            #        varName[j] = varName[j - 1]
            #        j -= 1
            #    varImp[j] = save
            #    varName[j] = saveName
            #For debug: test if assign var importance was correct
            #for attr in varImportance:
            #    if varImportance[attr] != varImp[varName.index(attr)]:
            #        print "ERROR: Variable importance of ", attr, " is not correct!"
            #OrderedVarImportance = {"VarNames":varName, "VarImportance":varImp}
            #=============================  end  =================================
        else:
            varImportance = {}
        #print time.asctime(), "=Done"
        # Save info about the variables used in the model (used by the write method)
        #attributeInfo = dataUtilities.DataTable(trainData.domain)
        # place the impute data as the first example of this data
        #attributeInfo.append(self.imputer.defaults)
        return RFClassifier(classifier=self.learner,
                            classVar=impData.domain.classVar,
                            imputeData=impData,
                            verbose=self.verbose,
                            varNames=CvMatrices["varNames"],
                            thisVer=True,
                            useBuiltInMissValHandling=self.useBuiltInMissValHandling,
                            varImportance=varImportance,
                            basicStat=self.basicStat,
                            NTrainEx=len(trainingData),
                            parameters=self.parameters)
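
The nActVars handling above makes the usual random-forest default explicit: a value of 0 means use sqrt(#attributes) active variables per split. Isolated as a tiny sketch:

from math import sqrt

def defaultNActVars(nAttributes):
    """Mirror the RF learner above: 0 means sqrt of the attribute count."""
    return int(sqrt(nAttributes)) if nAttributes > 0 else 0

print defaultNActVars(100)  # 10
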
Example #11
    def __call__(self, data, weight=None):
        bestSeed = None
        bestAcc = None
        bestNiter = None
        bestModel = None
        #fix self.nDiffIniWeights for the disabled mode
        if self.nDiffIniWeights <= 1:
            self.nDiffIniWeights = 1  # the loop over n different initial weights is disabled
        #Fix self.stopUPs for the disabled mode
        if self.stopUPs <= 0:
            self.stopUPs = 0  # Optimization of nIter will be disabled

        self.NTrainEx = len(data)
        #Remove from the domain any unused values of discrete attributes including class
        data = dataUtilities.getDataWithoutUnusedValues(data, True)

        #dataUtilities.rmAllMeta(data)
        if len(data.domain.getmetas()) == 0:
            cleanedData = data
        else:
            cleanedData = dataUtilities.getCopyWithoutMeta(data)
        # Create the imputer
        self.imputer = orange.ImputerConstructor_average(cleanedData)
        # Impute the data
        self.trainData = self.imputer(cleanedData)
        # If neither weight-init optimization nor nEpochs optimization (opencv layer) is being used, do not split the data
        if self.stopUPs != 0 or self.nDiffIniWeights > 1:
            #Define train-80% and validation set-20% of the input data
            indices = orange.MakeRandomIndices2(
                p0=0.2,
                stratified=orange.MakeRandomIndices.StratifiedIfPossible)
            ind = indices(cleanedData)
            self.trainData = cleanedData.select(ind, 1)
            validationSet = cleanedData.select(ind, 0)
        else:
            validationSet = None

        if self.verbose and self.nDiffIniWeights > 1:
            print "=========== Training ", self.nDiffIniWeights, " times with different initial weights =============="
        for n in range(self.nDiffIniWeights):
            if self.nDiffIniWeights <= 1:
                seed = 0  # in opencv ml ANN, seed=0 means the seed is disabled and the original seed will be used
            else:
                seed = len(cleanedData) * len(cleanedData.domain) * (
                    n + 1)  #seed can be any integer
            #Create a model with a specific seed for training the opencv ANN.
            #Also passing the step for the nIter optimization (self.stopUPs=0 disables nIter optimization)
            #Also passing the validation set to be used in the internal opencv nEpochs optimization.
            model = self.__train__(weight=None,
                                   seed=seed,
                                   validationSet=validationSet)
            #Skip evaluation if the weights loop is disabled
            if self.nDiffIniWeights <= 1:
                return model
            if cleanedData.domain.classVar.varType == orange.VarTypes.Discrete:
                Acc = evalUtilities.getClassificationAccuracy(
                    validationSet, model)
            else:
                Acc = -evalUtilities.getRMSE(validationSet, model)
            if bestModel == None or (Acc > bestAcc) or (
                    Acc == bestAcc and model.nIter < bestNiter):
                bestSeed = seed
                bestAcc = Acc
                bestNiter = model.nIter
                bestModel = model
            if self.verbose:
                print "nIter:%-7s  Acc:%-20s  seed: %s" % (model.nIter, Acc,
                                                           seed)

        if self.verbose:
            print "================ Best model Found: ==================="
        if self.verbose:
            print "nIter:%-7s  Acc:%-20s  seed: %s" % (bestNiter, bestAcc,
                                                       bestSeed)

        # DEBUG check that the returned model is indeed the best model, and not the last trained
        #if cleanedData.domain.classVar.varType == orange.VarTypes.Discrete:
        #    Acc = evalUtilities.getClassificationAccuracy(validationSet, bestModel)
        #else:
        #    Acc = -evalUtilities.getRMSE(validationSet, bestModel)
        #if self.verbose: print "================ Best model returned: ==================="
        #if self.verbose:  print "nIter:%-7s  Acc:%-20s  seed: %s" % (bestModel.nIter,Acc,bestModel.seed)

        return bestModel
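
The selection rule in the loop above: keep the model with the best accuracy, breaking ties in favor of fewer training iterations. Isolated as a sketch:

def pickBest(candidates):
    """candidates: (acc, nIter, model) tuples. Highest acc wins;
    ties go to the model with fewer iterations."""
    best = None
    for acc, nIter, model in candidates:
        if best is None or acc > best[0] or (acc == best[0] and nIter < best[1]):
            best = (acc, nIter, model)
    return best

print pickBest([(0.90, 200, "m1"), (0.92, 300, "m2"), (0.92, 150, "m3")])  # (0.92, 150, 'm3')
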
Example #12
    def no_test_Impute(self):  # Bayes cannot deal with regression
        """Test missing values imputation
        Assure that imputation works for the Bayes models. Test on data with missing values
        """
        #This data is loaded here to speed up the test suite since it is too big
        contTestDataPath = os.path.join(AZOC.AZORANGEHOME,
                                        "tests/source/data/linearTest.tab")
        contTrainDataPath = os.path.join(AZOC.AZORANGEHOME,
                                         "tests/source/data/linearTrain.tab")
        contTrain = dataUtilities.DataTable(contTrainDataPath)
        contTest = dataUtilities.DataTable(contTestDataPath)

        ex1 = contTest[5]
        ex2 = contTest[6]
        self.assert_(ex1["Desc 71"] != "?",
                     "The var Desc 71 shouldn't be missing!")
        self.assert_(ex2["Desc 138"] != "?",
                     "The var Desc 138 shouldn't be missing!")

        imputer = orange.ImputerConstructor_average(contTrain)

        CvBayeslearner = AZorngCvBayes.CvBayesLearner()
        Bayes = CvBayeslearner(contTrain)

        # Prediction for data as it is
        P1 = Bayes(ex1)
        P2 = Bayes(ex2)

        # Predictions changing one continuous and one discrete variable to 0
        ex1["Desc 71"] = 0
        ex2["Desc 138"] = 0
        P1_0 = Bayes(ex1)
        P2_0 = Bayes(ex2)

        # Predictions changing the same continuous and discrete variable to its corresponding imputation value
        ex1["Desc 71"] = imputer.defaults["Desc 71"]
        ex2["Desc 138"] = imputer.defaults["Desc 138"]
        P1_imp = Bayes(ex1)
        P2_imp = Bayes(ex2)

        # Predictions changing the same continuous and discrete variable to '?', which means that the same imputation
        # as in the last case will have to be made inside the classifier. So, the predicted value must be the same
        ex1["Desc 71"] = "?"
        ex2["Desc 138"] = "?"
        self.assert_(ex1["Desc 71"] == "?",
                     "The var Desc 71 should be missing now!")
        self.assert_(ex2["Desc 138"] == "?",
                     "The var Desc 138 should be missing now!")
        P1Miss = Bayes(ex1)
        P2Miss = Bayes(ex2)

        # Test if the prediction made for the example with missing value is the same as the one
        # for the example whose missing values were substituted using the same method as the classifier does.
        self.assert_(
            P1_imp == P1Miss,
            "Imputation was not made correctly inside the classifier")
        #self.assert_(P2_imp==P2Miss,"Imputation was not made correctly inside the classifier")

        # Assure that if other substitutions on those variables were made, the predicted value would be different,
        # and so, this is a valid method for testing the imputation
        self.assert_(
            P1.value !=
            P2.value)  # Just to assure that we are not comparing equal examples
        self.assert_(
            P1.value != P1_imp.value,
            "The imputed 1 was the same as the original ... try other example")
        self.assert_(
            P1_0.value != P1_imp.value,
            "The imputed 1 was the same as the replaced by 0. The classifier may be replacing missing values by 0"
        )
        self.assert_(
            P2.value != P2Miss.value,
            "The missing imputed 2 was the same as the original ... try other example"
        )
        self.assert_(
            P2_0.value != P2Miss.value,
            "The missing imputed 2 was the same as the replaced by 0. The classifier may be replacing missing values by 0"
        )

        #Test the imputer for saved models
        # Save the model
        scratchdir = os.path.join(AZOC.SCRATCHDIR,
                                  "scratchdir" + str(time.time()))
        os.mkdir(scratchdir)
        modelPath = os.path.join(scratchdir, "CvBayesModel")
        Bayes.write(modelPath)

        # Read in the model
        BayesM = AZorngCvBayes.CvBayesread(modelPath)
        # Predict the ex1 and ex2 which are still the examples with missing values '?'
        self.assert_(ex1["Desc 71"] == "?",
                     "Value of Var Desc 71 should be missing!")
        self.assert_(ex2["Desc 138"] == "?",
                     "Value of Var Desc 138 should be missing!")
        self.assert_(
            round(BayesM(ex1), 6) == round(P1Miss, 6),
            "Imputation on loaded model is not correct")
        self.assert_(
            round(BayesM(ex2), 6) == round(P2Miss, 6),
            "Imputation on loaded model is not correct")
        # Remove the scratch directory
        os.system("/bin/rm -rf " + scratchdir)
Example #13
    def testImpute(self):
        """Test missing values imputation
        Assure that imputation works for the pls models. Test on data with missing values
        """
        ex1 = self.trainImpData[0]
        ex2 = self.trainImpData[3]
        self.assert_(ex1["DiscAttr2"] != "?",
                     "The var DiscAttr2 shouldn't be missing!")
        self.assert_(ex2["Level"] != "?",
                     "The var Level shouldn't be missing!")

        imputer = orange.ImputerConstructor_average(self.trainImpData)

        pls = AZorngPLS.PLSLearner(self.trainImpData)

        # Prediction for data as it is
        P1 = pls(ex1)
        P2 = pls(ex2)

        # Predictions changing one continuous and one discrete variable to 0
        ex1["DiscAttr2"] = 0
        ex2["Level"] = 0
        P1_0 = pls(ex1)
        P2_0 = pls(ex2)

        # Predictions changing the same continuous and discrete variable to its corresponding imputation value
        ex1["DiscAttr2"] = imputer.defaults["DiscAttr2"]
        ex2["Level"] = imputer.defaults["Level"]
        P1_imp = pls(ex1)
        P2_imp = pls(ex2)

        # Predictions changing the same continuous and discrete variable to '?', which means that the same imputation
        # as in the last case will have to be made inside the classifier. So, the predicted value must be the same
        ex1["DiscAttr2"] = "?"
        ex2["Level"] = "?"
        self.assert_(ex1["DiscAttr2"] == "?",
                     "The var DiscAttr2 should be missing now!")
        self.assert_(ex2["Level"] == "?",
                     "The var Level should be missing now!")

        P1Miss = pls(ex1)
        P2Miss = pls(ex2)

        # Test if the prediction made for the example with missing value is the same as the one
        # for the example whose missing values were substituted using the same method as the classifier does.
        self.assert_(
            P1_imp == P1Miss,
            "Imputation was not made correctly inside the classifier")
        self.assert_(
            P2_imp == P2Miss,
            "Imputation was not made correctly inside the classifier")

        # Assure that if other substitutions on those variables were made, the predicted value would be different,
        # and so, this is a valid method for testing the imputation
        self.assert_(
            P1.value !=
            P2.value)  # Just to assure that we are not comparing equal examples
        self.assert_(P1.value != P1_imp.value)
        self.assert_(P1_0.value != P1_imp.value)
        self.assert_(P2.value != P2_imp.value)
        self.assert_(P2_0.value != P2_imp.value)

        #Test the imputer for saved models
        # Save the model
        scratchdir = os.path.join(AZOC.SCRATCHDIR,
                                  "scratchdir" + str(time.time()))
        os.mkdir(scratchdir)
        modelPath = os.path.join(scratchdir, "PLSModel")
        pls.write(modelPath)

        # Read in the model
        plsM = AZorngPLS.PLSread(modelPath)
        # Predict the ex1 and ex2 which are still the examples with missing values '?'
        self.assert_(ex1["DiscAttr2"] == "?",
                     "Value of Var DiscAttr2 should be missing!")
        self.assert_(ex2["Level"] == "?",
                     "Value of Var Level should be missing!")
        self.assert_(
            plsM(ex1) == P1Miss, "Imputation on loaded model is not correct")
        self.assert_(
            plsM(ex2) == P2Miss, "Imputation on loaded model is not correct")
        # Remove the scratch directory
        os.system("/bin/rm -rf " + scratchdir)
Example #14
def createInvCovMat(data,
                    TSDT_file=None,
                    C_file=None,
                    SQRTICM_file=None,
                    MTD_file=None):
    """
     Inputs:
        data            - The train data orange table
     Outputs (all in numpy format:  .npy)
        *Not used*  ICM_file        - Path to save the Inverted Covariance Matrix
        TSDT_file       - Path to save the TrainSet Data Table  
        C_file          - Path to save the Center file 
        SQRTICM_file    - Path to save the Sqrt Inverted Covariance Matrix 
        MTD_file        - Path to save the Mahalanobis Transformed Data 
    """
    from AZutilities import similarityMetrics
    import orange

    if data.hasMissingValues():
        averageImputer = orange.ImputerConstructor_average(data)
        data = averageImputer(data)
    training_set = similarityMetrics.getTrainingSet(data)
    center = numpy.average(training_set.data_table, 0)

    # BackCompatibility ONLY. TODO: To Remove when mahalanobis is updated
    if SQRTICM_file:
        ICM_file = os.path.join(
            os.path.split(SQRTICM_file)[0], "invCovMatrix.npy")
        covarMat = numpy.cov(numpy.asarray(training_set.data_table), rowvar=0)
        inverse_covarMat = numpy.linalg.pinv(covarMat, rcond=1e-10)
        numpy.save(ICM_file, inverse_covarMat)

    if TSDT_file:
        numpy.save(TSDT_file, training_set.data_table)
    if C_file:
        numpy.save(C_file, center)

    # Next call to createMahalanobisData.sh is to be removed when sqrtm(CI) is working.
    #This is known to be working in python 2.7 - Numpy - scipy
    # not yet solved on module numpy/python2.7-64_1.6.0  (python/64_2.7.1)
    status, out = commands.getstatusoutput(
        "env -i $AZORANGEHOME/azorange/AZutilities/createMahalanobisData.sh " +
        TSDT_file + " " + C_file + " " + SQRTICM_file + " " + MTD_file)
    if status:
        print "Error running Advanced Files creator: " + str(out)
        return False
    else:
        return True

    #Unreachable reference code: what createMahalanobisData.sh currently runs, using another version of python
    #   (python2.7 locally installed in azorangeLive home directory)
    print "Creating advanced files"
    from scipy.linalg import sqrtm

    data = numpy.load(TSDT_file)
    center = numpy.load(C_file)

    m = numpy.mean(data, axis=0)
    data -= center
    print " Covariance matrix..."
    C = numpy.cov(numpy.transpose(data))
    print "Inverse Covariance matrix..."
    CI = numpy.linalg.pinv(C, rcond=1e-10)
    print "Square Root Inverse Covariance matrix..."
    print CI
    SQI = sqrtm(CI).real

    print "Save Square Root Inverse Covariance matrix..."
    numpy.save(SQRTICM_file, SQI)

    print "Transforming..."
    MT = numpy.dot(data, SQI.T)  # mahalanobis transformed data
    print " Saving Mahalanobis Transformed Data..."
    numpy.save(MTD_file, MT)

    return True
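
Why the sqrtm(CI) transform: with Y = (X - center) . sqrtm(S^-1)^T, Euclidean distances between rows of Y equal Mahalanobis distances in the original space, so nearest-neighbor searches can run on the pre-transformed data. A small numpy/scipy check of that identity:

import numpy
from scipy.linalg import sqrtm

X = numpy.array([[1.0, 2.0], [2.0, 3.0], [3.0, 5.0], [4.0, 4.0]])
center = numpy.average(X, 0)
CI = numpy.linalg.pinv(numpy.cov(numpy.transpose(X)), rcond=1e-10)
SQI = sqrtm(CI).real

Y = numpy.dot(X - center, SQI.T)      # Mahalanobis-transformed data

d = X[0] - X[1]
md = numpy.sqrt(numpy.dot(numpy.dot(d, CI), d))   # Mahalanobis in X-space
ed = numpy.linalg.norm(Y[0] - Y[1])               # Euclidean in Y-space
print md, ed                                      # should agree up to rounding
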
Example #15
File: orngLR.py  Project: stefie10/slu_hri
    def __call__(self, examples, weight=0):
        # the next function extends the data set with unknown values
        def createLogRegExampleTable(data, weightID):
            finalData = orange.ExampleTable(data)
            origData = orange.ExampleTable(data)
            for at in data.domain.attributes:
                # for each attribute, create a new ExampleTable newData
                # and add a new attribute (a continuous variable) to dataOrig, dataFinal and newData
                if at.varType == orange.VarTypes.Continuous:
                    atDisc = orange.FloatVariable(at.name + "Disc")
                    newDomain = orange.Domain(origData.domain.attributes +
                                              [atDisc, data.domain.classVar])
                    newDomain.addmetas(data.domain.getmetas())  # take the metas from the source data
                    finalData = orange.ExampleTable(newDomain, finalData)
                    newData = orange.ExampleTable(newDomain, origData)
                    origData = orange.ExampleTable(newDomain, origData)
                    for d in origData:
                        d[atDisc] = 0
                    for d in finalData:
                        d[atDisc] = 0
                    for i, d in enumerate(newData):
                        d[atDisc] = 1
                        d[at] = 0
                        d[weightID] = 100 * data[i][weightID]

                elif at.varType == orange.VarTypes.Discrete:
                    # in dataOrig, dataFinal and newData, add one more value to attribute "at", whose value is the attribute name + "X"
                    atNew = orange.EnumVariable(at.name,
                                                values=at.values +
                                                [at.name + "X"])
                    newDomain = orange.Domain(
                        filter(lambda x: x != at, origData.domain.attributes) +
                        [atNew, origData.domain.classVar])
                    newDomain.addmetas(origData.domain.getmetas())
                    temp_finalData = orange.ExampleTable(finalData)
                    finalData = orange.ExampleTable(newDomain, finalData)
                    newData = orange.ExampleTable(newDomain, origData)
                    temp_origData = orange.ExampleTable(origData)
                    origData = orange.ExampleTable(newDomain, origData)
                    for i, d in enumerate(origData):
                        d[atNew] = temp_origData[i][at]
                    for i, d in enumerate(finalData):
                        d[atNew] = temp_finalData[i][at]
                    for i, d in enumerate(newData):
                        d[atNew] = at.name + "X"
                        d[weightID] = 10 * data[i][weightID]
                finalData.extend(newData)
            return finalData

        learner = LogRegLearner(imputer=orange.ImputerConstructor_average(),
                                removeSingular=self.removeSingular)
        # get Original Model
        orig_model = learner(examples, weight)

        # get extended Model (you should not change data)
        if weight == 0:
            weight = orange.newmetaid()
            examples.addMetaAttribute(weight, 1.0)
        extended_examples = createLogRegExampleTable(examples, weight)
        extended_model = learner(extended_examples, weight)

        ##        print examples[0]
        ##        printOUT(orig_model)
        ##        print orig_model.domain
        ##        print orig_model.beta

        ##        printOUT(extended_model)
        # compute the deviations
        # get sum of all betas
        beta = 0
        betas_ap = []
        for m in [extended_model]:  # only one extended model is built in this variant
            beta_add = m.beta[m.continuizedDomain.attributes[-1]]
            betas_ap.append(beta_add)
            beta = beta + beta_add

        # subtract it from the intercept
        #print "beta", beta
        logistic_prior = orig_model.beta[0] + beta

        # compare it to bayes prior
        bayes = orange.BayesLearner(examples)
        bayes_prior = math.log(bayes.distribution[1] / bayes.distribution[0])

        # normalize errors
        #print "bayes", bayes_prior
        #print "lr", orig_model.beta[0]
        #print "lr2", logistic_prior
        #print "dist", orange.Distribution(examples.domain.classVar,examples)
        k = (bayes_prior - orig_model.beta[0]) / (logistic_prior -
                                                  orig_model.beta[0])
        #print "prej", betas_ap
        betas_ap = [k * x for x in betas_ap]
        #print "potem", betas_ap

        # return the original model and the corresponding a-priori betas
        return (orig_model, betas_ap)
Example #16
def defaultImputer(dataset):
    """Default imputer with average data imputaton."""
    return orange.ImputerConstructor_average(dataset)
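
A usage sketch for this wrapper (the data file is hypothetical):

import orange

data = orange.ExampleTable("train.tab")  # hypothetical file containing '?' values
imputer = defaultImputer(data)           # same as orange.ImputerConstructor_average(data)
cleaned = imputer(data)                  # every '?' replaced by the mean/majority value
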
Example #17
    def __call__(self, trainingData, weight=None):
        """Creates an PLS model from the data in trainingData. """
        if not AZBaseClasses.AZLearner.__call__(self, trainingData, weight):
            return None
        #Remove from the domain any unused values of discrete attributes including class
        trainingData = dataUtilities.getDataWithoutUnusedValues(
            trainingData, True)
        # Create path for the Orange data
        scratchdir = miscUtilities.createScratchDir(desc="PLS")
        OrngFile = os.path.join(scratchdir, "OrngData.tab")

        # Remove meta attributes from training data to make the imputer work with examples without the meta attributes.
        #dataUtilities.rmAllMeta(trainingData)
        if len(trainingData.domain.getmetas()) == 0:
            trainData = trainingData
        else:
            trainData = dataUtilities.getCopyWithoutMeta(trainingData)

        # Create the imputer
        self.imputer = orange.ImputerConstructor_average(trainData)
        # Impute the data
        trainData = self.imputer(trainData)
        # Save the Data already imputed to an Orange formated file
        if self.verbose > 1:
            print time.asctime(), "Saving Orange Data to a tab file..."
        orange.saveTabDelimited(OrngFile, trainData)
        if self.verbose > 1: print time.asctime(), "done"

        # Create the PLS instance
        if self.verbose > 1: print time.asctime(), "Creating PLS Object..."
        learner = pls.PlsAPI()
        if self.verbose > 1: print time.asctime(), "done"

        # Assign the PLS parameters
        learner.SetParameter('v', str(self.verbose))
        learner.SetParameter('debug', str(int(self.verbose > 0)))
        learner.SetParameter('method', self.method)
        if int(self.k) > len(trainData.domain.attributes):
            learner.SetParameter('k', str(len(trainData.domain.attributes)))
            if self.verbose > 0:
                print "Warning! The number of components was greater than the number of attributes."
            if self.verbose > 0:
                print "   Components were set to ", len(
                    trainData.domain.attributes)
        else:
            learner.SetParameter('k', self.k)
        learner.SetParameter('precision', self.precision)
        learner.SetParameter('sDir', scratchdir)  #AZOC.SCRATCHDIR)

        # Read the Orange Formated file and Train the Algorithm
        # TRAIN
        if self.verbose > 1: print time.asctime(), "Training..."
        learner.Train(OrngFile)
        if self.verbose > 1:
            print "Train finished at ", time.asctime()
            print "PLS trained in: " + str(
                learner.GetCPUTrainTime()) + " seconds"
            print "Method:     " + learner.GetParameter("method")
            print "Components: " + learner.GetParameter("k")
            print "Precision:  " + learner.GetParameter("precision")

        # Remove the scratch file
        if self.verbose == 0:
            miscUtilities.removeDir(scratchdir)
        else:
            print "The directory " + scratchdir + " was not deleted because DEBUG flag is ON"
        del trainData
        impData = self.imputer.defaults
        return PLSClassifier(
            classifier=learner,
            name="Classifier of " + self.name,
            classVar=trainingData.domain.classVar,
            imputeData=impData,
            verbose=self.verbose,
            varNames=[attr.name for attr in trainingData.domain.attributes],
            NTrainEx=len(trainingData),
            basicStat=self.basicStat,
            parameters=self.parameters)  #learner.GetClassVarName()