예제 #1
0
파일: LibSVM.py 프로젝트: kentwang/sandbox
    def parallelVfcvRbf(self, X, y, idx, type="C_SVC"):
        """
        Run parallel model selection over a grid of RBF kernel parameters.

        The kernel is set to "gaussian" and a parameter grid is assembled
        from ``getCs()`` and ``getGammas()``. For any ``type`` other than
        "C_SVC" the values of ``getEpsilons()`` are added to the grid. The
        search is delegated to ``parallelModelSelect`` and its result is
        returned unchanged.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param idx: A list of train/test splits

        :param type: The SVM type; "C_SVC" leaves epsilon out of the grid
        :type type: :class:`str`
        """
        for arr in (X, y):
            Parameter.checkClass(arr, numpy.ndarray)

        self.setKernel("gaussian")

        # Keys name the setters to vary (presumably applied by
        # parallelModelSelect); values are the candidate settings.
        paramDict = {"setC": self.getCs(), "setGamma": self.getGammas()}
        if type != "C_SVC":
            paramDict["setEpsilon"] = self.getEpsilons()

        return self.parallelModelSelect(X, y, idx, paramDict)
예제 #2
0
 def __init__(self, kernel, tau1, tau2):
     """
     Store a kernel together with the two parameters tau1 and tau2.

     :param kernel: The kernel object, checked to be an AbstractKernel
     :param tau1: Checked with Parameter.checkFloat against 0.0 and infinity
     :param tau2: Checked with Parameter.checkFloat against 0.0 and infinity
     """
     upperBound = float('inf')
     for tau in (tau1, tau2):
         Parameter.checkFloat(tau, 0.0, upperBound)
     Parameter.checkClass(kernel, AbstractKernel)

     self.kernel = kernel
     self.tau1, self.tau2 = tau1, tau2
예제 #3
0
def parallelPenaltyGridRbf(svm, X, y, fullX, gridPoints, pdfX, pdfY1X, pdfYminus1X):
    """
    Find out the "ideal" penalty for every (C, gamma) pair of svm.

    One task per combination of ``svm.Cs`` and ``svm.gammas`` is passed to
    ``computeIdealPenalty`` through a process pool and the results are
    collected into a grid.

    :param svm: An object exposing the arrays ``Cs`` and ``gammas``

    :param X: The training examples
    :type X: :class:`numpy.ndarray`

    :param y: The training labels
    :type y: :class:`numpy.ndarray`

    :param fullX: Forwarded unchanged to ``computeIdealPenalty``
    :param gridPoints: Forwarded unchanged to ``computeIdealPenalty``
    :param pdfX: Forwarded unchanged to ``computeIdealPenalty``
    :param pdfY1X: Forwarded unchanged to ``computeIdealPenalty``
    :param pdfYminus1X: Forwarded unchanged to ``computeIdealPenalty``

    :return: An array of shape (svm.Cs.shape[0], svm.gammas.shape[0]) whose
        entry (i, j) is the result for ``svm.Cs[i]`` and ``svm.gammas[j]``
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(y, numpy.ndarray)
    chunkSize = 10

    numCs = svm.Cs.shape[0]
    numGammas = svm.gammas.shape[0]
    idealPenalties = numpy.zeros((numCs, numGammas))

    # Row-major order over (i, j); the results are read back in the same order
    paramList = [
        (X, y, fullX, svm.Cs[i], svm.gammas[j], gridPoints, pdfX, pdfY1X, pdfYminus1X)
        for i in range(numCs)
        for j in range(numGammas)
    ]

    pool = multiprocessing.Pool()
    try:
        resultsIterator = pool.imap(computeIdealPenalty, paramList, chunkSize)

        for i in range(numCs):
            for j in range(numGammas):
                # The builtin next() works on Python 2.6+ and Python 3,
                # unlike the Python-2-only iterator.next() method
                idealPenalties[i, j] = next(resultsIterator)
    finally:
        # Always release the worker processes, even if a task raised
        pool.terminate()

    return idealPenalties
예제 #4
0
    def auc(predY, trueY):
        """
        Compute the area under the ROC curve for scores predY against the
        labels trueY. Can be used in conjunction with evaluateCV using the
        scores, and true labels. Note the order of parameters.

        :param predY: The predicted scores, a 1D array
        :type predY: :class:`numpy.ndarray`

        :param trueY: The true labels, a 1D array with at most two distinct values
        :type trueY: :class:`numpy.ndarray`

        :return: The AUC, or 0.5 when trueY contains a single distinct label

        :raises ImportError: If sklearn.metrics cannot be imported
        :raises ValueError: If either array is not 1D or trueY has more than
            two distinct values
        """
        # Imported here so the module loads without scikit-learn installed;
        # an ImportError simply propagates to the caller.
        import sklearn.metrics

        Parameter.checkClass(predY, numpy.ndarray)
        Parameter.checkClass(trueY, numpy.ndarray)
        if predY.ndim != 1:
            raise ValueError("Expecting predY to be 1D")
        if trueY.ndim != 1:
            raise ValueError("Expecting trueY to be 1D")

        numLabels = numpy.unique(trueY).shape[0]
        if numLabels > 2:
            raise ValueError("Found more than two label types in trueY")

        # With only one label present, fall back to the fixed value 0.5
        if numLabels == 1:
            return 0.5

        fpr, tpr, _ = sklearn.metrics.roc_curve(trueY.ravel(), predY.ravel())
        # Use the public sklearn.metrics.auc; the private
        # sklearn.metrics.metrics submodule no longer exists in scikit-learn.
        return sklearn.metrics.auc(fpr, tpr)
예제 #5
0
    def learnModel(self, X, Y):
        """
        Learn a model from the examples X and the labels Y.

        X and Y are combined with ``_getDataFrame`` and the model is fitted
        through ``learnModelDataFrame`` using the R formula 'class ~ .'.
        Afterwards both the Python and the R garbage collectors are run and,
        if ``self.printMemStats`` is set, memory statistics are logged at
        debug level.

        :param X: The examples
        :type X: :class:`numpy.ndarray`

        :param Y: The labels; a 1D array is converted to a single column
        :type Y: :class:`numpy.ndarray`

        :raises ValueError: If Y has fewer than two distinct values
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkArray(X)
        Parameter.checkArray(Y)

        # Only a lower bound is enforced here: fewer than two distinct labels
        if numpy.unique(Y).shape[0] < 2:
            raise ValueError("Vector of labels must be binary, currently numpy.unique(Y) = " + str(numpy.unique(Y)))

        # If Y is 1D turn it into a 2D array with a single column
        if Y.ndim == 1:
            Y = numpy.array([Y]).T
        
        XY = self._getDataFrame(X, Y)
        formula = robjects.Formula('class ~ .')
        self.learnModelDataFrame(formula, XY)

        # Run the Python collector and ask R to collect and profile its memory
        gc.collect()
        robjects.r('gc(verbose=TRUE)')
        robjects.r('memory.profile()')
        gc.collect()

        if self.printMemStats:
            logging.debug(self.getLsos()())
            # NOTE: locals() is handed to memDisplay, so the names of the local
            # variables in this method are part of what gets logged
            logging.debug(ProfileUtils.memDisplay(locals()))
예제 #6
0
파일: Util.py 프로젝트: kentwang/sandbox
    def randomChoice(V, n=1):
        """
        Make a random choice from a vector V of values which are unnormalised
        probabilities. Return the corresponding index. For example if v = [1, 2, 4]
        then the probability of the indices respectively are [1/7, 2/7, 4/7]. The
        parameter n is the number of random choices to make. If V is a matrix,
        then the rows are taken as probabilities, and a choice is made for each
        row.

        :param V: The unnormalised probabilities, as a 1D or 2D array
        :type V: :class:`numpy.ndarray`

        :param n: The number of random choices to make
        :type n: :class:`int`

        :return: -1 if the first dimension of V is empty; an array of n indices
            for 1D V; an integer array of shape (V.shape[0], n) for 2D V

        :raises ValueError: If V has more than 2 dimensions
        """
        Parameter.checkClass(V, numpy.ndarray)

        if V.shape[0] == 0:
            return -1

        if V.ndim == 1:
            # Scale uniform draws to the total mass and locate each in the CDF
            cumV = numpy.cumsum(V)
            p = numpy.random.rand(n) * cumV[-1]
            return numpy.searchsorted(cumV, p)
        elif V.ndim == 2:
            cumV = numpy.cumsum(V, 1)
            P = numpy.random.rand(V.shape[0], n) * numpy.array([cumV[:, -1]]).T

            # The builtin int is what the numpy.int alias stood for; the alias
            # itself was removed in NumPy 1.24
            inds = numpy.zeros(P.shape, int)
            for i in range(P.shape[0]):
                inds[i, :] = numpy.searchsorted(cumV[i, :], P[i, :])

            return inds
        else:
            raise ValueError("Invalid number of dimensions")
예제 #7
0
    def parallelVfcvRbf(self, X, y, idx, type="C_SVC"):
        """
        Run parallel model selection over a grid of RBF kernel parameters.

        After switching the kernel to "gaussian", the grid is built from
        ``getCs()`` and ``getGammas()``, plus ``getEpsilons()`` whenever
        ``type`` is not "C_SVC". The result of ``parallelModelSelect`` on that
        grid is returned as is.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param idx: A list of train/test splits

        :param type: The SVM type; "C_SVC" leaves epsilon out of the grid
        :type type: :class:`str`
        """
        for arr in (X, y):
            Parameter.checkClass(arr, numpy.ndarray)

        self.setKernel("gaussian")

        paramDict = {"setC": self.getCs(), "setGamma": self.getGammas()}
        # Every type other than C_SVC also searches over epsilon
        if type != "C_SVC":
            paramDict["setEpsilon"] = self.getEpsilons()

        return self.parallelModelSelect(X, y, idx, paramDict)
예제 #8
0
    def predict(self, X):
        """
        Score the examples given as the rows of X by walking the learnt tree.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :return: A vector of scores corresponding to each example.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X)

        numExamples = X.shape[0]
        scores = numpy.zeros(numExamples)

        # All examples start at the root vertex (0, 0)
        rootVertex = self.tree.getVertex((0, 0))
        rootVertex.setTestInds(numpy.arange(numExamples))

        # Go down the tree level by level; level `depth` is probed for the
        # vertex ids (depth, 0) .. (depth, 2**depth - 1)
        for depth in range(self.maxDepth+1):
            for k in range(2**depth):
                vertexId = (depth, k)
                if not self.tree.vertexExists(vertexId):
                    continue

                self.classifyNode(self.tree, X, depth, k)

                vertex = self.tree.getVertex(vertexId)
                if vertex.isLeafNode():
                    leafInds = vertex.getTestInds()
                    scores[leafInds] = vertex.getScore()

        return scores
예제 #9
0
 def __init__(self, kernel, tau1, tau2):
     """
     Keep a kernel and the two parameters tau1 and tau2 on the instance.

     :param kernel: The kernel object, checked to be an AbstractKernel
     :param tau1: Checked with Parameter.checkFloat against 0.0 and infinity
     :param tau2: Checked with Parameter.checkFloat against 0.0 and infinity
     """
     infinity = float('inf')
     for tau in (tau1, tau2):
         Parameter.checkFloat(tau, 0.0, infinity)
     Parameter.checkClass(kernel, AbstractKernel)

     self.kernel = kernel
     self.tau1, self.tau2 = tau1, tau2
예제 #10
0
    def evaluate(self, X1, X2):
        """
        Evaluate the kernel between every row of X1 and every row of X2.

        Entry (i, j) of the result is
        exp((2 <x1_i, x2_j> - ||x1_i||^2 - ||x2_j||^2) / (2 sigma^2)),
        i.e. exp(-||x1_i - x2_j||^2 / (2 sigma^2)) with sigma = self.sigma.

        :param X1: First set of examples.
        :type X1: :class:`numpy.ndarray`

        :param X2: Second set of examples.
        :type X2: :class:`numpy.ndarray`

        :return: The kernel matrix with X1.shape[0] rows and X2.shape[0] columns

        :raises ValueError: If X1 and X2 differ in their number of columns
        """
        for examples in (X1, X2):
            Parameter.checkClass(examples, numpy.ndarray)

        if X1.shape[1] != X2.shape[1]:
            raise ValueError("Invalid matrix dimentions: " + str(X1.shape) + " " + str(X2.shape))

        # Column vectors of ones used to tile the squared norms into matrices
        ones1 = numpy.ones((X1.shape[0], 1))
        ones2 = numpy.ones((X2.shape[0], 1))

        sqNorms1 = numpy.sum(X1**2, 1)
        sqNorms2 = numpy.sum(X2**2, 1)

        innerProducts = numpy.dot(X1, X2.T)

        # Negated squared distances via the expansion of ||a - b||^2
        negSqDists = 2*innerProducts - numpy.outer(sqNorms1, ones2) - numpy.outer(ones1, sqNorms2)
        exponent = negSqDists / (2*self.sigma**2)

        return numpy.exp(exponent)
예제 #11
0
파일: Util.py 프로젝트: kentwang/sandbox
    def random2Choice(V, n=1):
        """
        Make a random binary choice from a vector V of values which are unnormalised
        probabilities. Return the corresponding index. For example if v = [1, 2]
        then the probability of the indices respectively are [1/3, 2/3]. The
        parameter n is the number of random choices to make. If V is a matrix,
        then the rows are taken as probabilities, and a choice is made for each
        row.

        :param V: The unnormalised probabilities: a 1D array of length 2 or a
            2D array with 2 columns
        :type V: :class:`numpy.ndarray`

        :param n: The number of random choices to make
        :type n: :class:`int`

        :return: An integer array of 0/1 indices, of shape (n,) for 1D V and
            (V.shape[0], n) for 2D V

        :raises ValueError: If V is not binary along its last axis, or is
            neither 1D nor 2D
        """
        Parameter.checkClass(V, numpy.ndarray)

        if V.ndim == 1 and V.shape[0] != 2:
            raise ValueError("Function only works on binary probabilities")
        if V.ndim == 2 and V.shape[1] != 2:
            raise ValueError("Function only works on binary probabilities")

        # In both branches index 1 is chosen when the scaled uniform draw is
        # at least the mass of index 0. The builtin int replaces the
        # numpy.int alias, which was removed in NumPy 1.24.
        if V.ndim == 1:
            cumV = numpy.cumsum(V)
            p = numpy.random.rand(n) * cumV[-1]
            cumV2 = numpy.ones(n) * cumV[0] - p
            return numpy.array(cumV2 <= 0, int)
        elif V.ndim == 2:
            cumV = numpy.cumsum(V, 1)
            P = numpy.random.rand(V.shape[0], n) * numpy.array([cumV[:, -1]]).T
            cumV2 = numpy.outer(cumV[:, 0], numpy.ones(n)) - P
            return numpy.array(cumV2 <= 0, int)
        else:
            raise ValueError("Invalid number of dimensions")
예제 #12
0
    def learnModel(self, X, Y):
        """
        Learn a model from the examples X and the labels Y.

        X and Y are combined with ``_getDataFrame`` and the model is fitted
        through ``learnModelDataFrame`` using the R formula 'class ~ .'.
        Afterwards both the Python and the R garbage collectors are run and,
        if ``self.printMemStats`` is set, memory statistics are logged at
        debug level.

        :param X: The examples
        :type X: :class:`numpy.ndarray`

        :param Y: The labels; a 1D array is converted to a single column
        :type Y: :class:`numpy.ndarray`

        :raises ValueError: If Y has fewer than two distinct values
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkArray(X)
        Parameter.checkArray(Y)

        # Only a lower bound is enforced here: fewer than two distinct labels
        if numpy.unique(Y).shape[0] < 2:
            raise ValueError(
                "Vector of labels must be binary, currently numpy.unique(Y) = "
                + str(numpy.unique(Y)))

        # If Y is 1D turn it into a 2D array with a single column
        if Y.ndim == 1:
            Y = numpy.array([Y]).T

        XY = self._getDataFrame(X, Y)
        formula = robjects.Formula('class ~ .')
        self.learnModelDataFrame(formula, XY)

        # Run the Python collector and ask R to collect and profile its memory
        gc.collect()
        robjects.r('gc(verbose=TRUE)')
        robjects.r('memory.profile()')
        gc.collect()

        if self.printMemStats:
            logging.debug(self.getLsos()())
            # NOTE: locals() is handed to memDisplay, so the names of the local
            # variables in this method are part of what gets logged
            logging.debug(ProfileUtils.memDisplay(locals()))
예제 #13
0
 def __init__(self, fileName):
     """
     Lock a job whose results are saved as fileName.

     The name of the lock file is fileName with ".lock" appended.

     :param fileName: The name of the results file
     :type fileName: :class:`str`
     """
     Parameter.checkClass(fileName, str)

     lockSuffix = ".lock"
     self.fileName = fileName
     self.lockFileName = fileName + lockSuffix
예제 #14
0
    def __init__(self, algorithm="PATH", alpha=0.5, featureInds=None, useWeightM=True):
        """
        Initialise the matching object with a given algorithm name, alpha
        which is a trade-off between matching adjacency matrices and vertex
        labels, and featureInds which is an optional array of indices to use
        for label matching.

        :param algorithm: The name of the matching algorithm
        :type algorithm: :class:`str`

        :param alpha: A value in [0, 1] which is smaller to match graph structure, larger to match the labels more

        :param featureInds: An optional array of indices to use for label matching

        :param useWeightM: Stored on the instance as ``useWeightM``
        """
        Parameter.checkFloat(alpha, 0.0, 1.0)
        Parameter.checkClass(algorithm, str)

        # Settings supplied by the caller
        self.algorithm, self.alpha = algorithm, alpha
        self.featureInds, self.useWeightM = featureInds, useWeightM

        # Fixed defaults
        self.maxInt = 10**9
        # gamma is the same as dummy_nodes_c_coef for costing added vertex labels
        self.gamma = 0.0
        # rho is the same as dummy_nodes_fill
        self.rho = 0.5
        self.init = "rand"
        self.lambdaM = 50
예제 #15
0
    def predict(self, X):
        """
        Return the scores that ``predictScores`` computes for X.

        :param X: The examples to score
        :type X: :class:`numpy.ndarray`
        """
        Parameter.checkClass(X, numpy.ndarray)
        return self.predictScores(X)
예제 #16
0
    def __init__(self, kernelX, tau1, tau2):
        """
        Store the kernel kernelX and the two parameters tau1 and tau2.

        :param kernelX: The kernel object, checked to be an AbstractKernel
        :param tau1: Checked with Parameter.checkFloat against 0.0 and 1.0
        :param tau2: Checked with Parameter.checkFloat against 0.0 and 1.0
        """
        for tau in (tau1, tau2):
            Parameter.checkFloat(tau, 0.0, 1.0)
        Parameter.checkClass(kernelX, AbstractKernel)

        self.tau1, self.tau2 = tau1, tau2
        self.kernelX = kernelX
예제 #17
0
    def __init__(self, kernelX, tau1, tau2):
        """
        Keep the kernel kernelX and the parameters tau1 and tau2 on the instance.

        :param kernelX: The kernel object, checked to be an AbstractKernel
        :param tau1: Checked with Parameter.checkFloat against 0.0 and 1.0
        :param tau2: Checked with Parameter.checkFloat against 0.0 and 1.0
        """
        lower, upper = 0.0, 1.0
        Parameter.checkFloat(tau1, lower, upper)
        Parameter.checkFloat(tau2, lower, upper)
        Parameter.checkClass(kernelX, AbstractKernel)

        self.tau1, self.tau2 = tau1, tau2
        self.kernelX = kernelX
예제 #18
0
    def predict(self, X):
        """
        Predict by returning the output of ``predictScores`` for X.

        :param X: The examples to score
        :type X: :class:`numpy.ndarray`
        """
        Parameter.checkClass(X, numpy.ndarray)
        return self.predictScores(X)
예제 #19
0
    def evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=True):
        """
        Evaluate a learning algorithm using the given list of training/test splits.
        The metricMethod is a method which takes (predictedY, realY) as input
        and returns a metric about the quality of the evaluation.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param y: A vector of labels
        :type y: :class:`ndarray`

        :param idx: A sequence of (training indices, test indices) pairs; its type is not checked

        :param learnModel: A function such that learnModel(X, y) finds a mapping from X to y
        :type learnModel: :class:`function`

        :param predict: A function such that predict(X) makes predictions for X
        :type predict: :class:`function`

        :param metricMethod: A function such that metricMethod(predY, testY) returns the quality of predicted labels predY
        :type metricMethod: :class:`function`

        :param progress: Whether to print the iteration progress
        :type progress: :class:`bool`

        :return: An array holding the metric value of each split, in order

        :raises ValueError: If y is not 1D
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X, softCheck=True)
        Parameter.checkInt(X.shape[0], 1, float('inf'))
        Parameter.checkClass(y, numpy.ndarray)
        Parameter.checkArray(y, softCheck=True)

        if y.ndim != 1:
            raise ValueError("Dimention of y must be 1")

        numSplits = len(idx)
        metrics = numpy.zeros(numSplits)
        logging.debug("EvaluateLearn: Using " + str(numSplits) + " splits on " + str(X.shape[0]) + " examples")

        for splitInd, (trainInds, testInds) in enumerate(idx):
            if progress:
                Util.printConciseIteration(splitInd, 1, numSplits)

            trainX, testX = X[trainInds, :], X[testInds, :]
            trainY, testY = y[trainInds], y[testInds]

            learnModel(trainX, trainY)
            predY = predict(testX)
            # Collect after every fold
            gc.collect()

            metrics[splitInd] = metricMethod(predY, testY)

        return metrics
예제 #20
0
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run stratified cross validation, recording the AUC and ROC curve on the
        training and test part of every fold. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`

        :return: A tuple (bestParams, allMetrics, bestMetaDicts): bestParams
            is an empty list, allMetrics is the list [train AUCs, train ROCs,
            test AUCs, test ROCs] and bestMetaDicts has one empty dict per fold

        :raises ValueError: If Y is not 1D
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        splits = cross_val.StratifiedKFold(Y, folds)

        trainAUCs = numpy.zeros(folds)
        trainROCs = []
        testAUCs = numpy.zeros(folds)
        testROCs = []
        metaDicts = []

        for foldInd, (trainInds, testInds) in enumerate(splits):
            Util.printIteration(foldInd, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            # auc is called with (scores, labels) but roc with (labels, scores)
            trainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
            testAUCs[foldInd] = Evaluator.auc(predTestY, testY)

            trainROCs.append(Evaluator.roc(trainY, predTrainY))
            testROCs.append(Evaluator.roc(testY, predTestY))

            # No per-fold meta data is recorded here
            metaDicts.append({})

        logging.debug("Mean test AUC = " + str(numpy.mean(testAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(testAUCs)))

        # No parameters are selected, so the first element stays empty
        return ([], [trainAUCs, trainROCs, testAUCs, testROCs], metaDicts)
예제 #21
0
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run stratified cross validation and gather, for each fold, the AUC and
        ROC curve on both the training and the test examples. In this case Y
        is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`

        :return: A tuple (bestParams, allMetrics, bestMetaDicts): bestParams
            is an empty list, allMetrics is the list [train AUCs, train ROCs,
            test AUCs, test ROCs] and bestMetaDicts has one empty dict per fold

        :raises ValueError: If Y is not 1D
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        splits = cross_val.StratifiedKFold(Y, folds)

        trainAUCs = numpy.zeros(folds)
        trainROCs = []
        testAUCs = numpy.zeros(folds)
        testROCs = []
        metaDicts = []

        for foldInd, (trainInds, testInds) in enumerate(splits):
            Util.printIteration(foldInd, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            # auc is called with (scores, labels) but roc with (labels, scores)
            trainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
            testAUCs[foldInd] = Evaluator.auc(predTestY, testY)

            trainROCs.append(Evaluator.roc(trainY, predTrainY))
            testROCs.append(Evaluator.roc(testY, predTestY))

            # No per-fold meta data is recorded here
            metaDicts.append({})

        logging.debug("Mean test AUC = " + str(numpy.mean(testAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(testAUCs)))

        allMetrics = [trainAUCs, trainROCs, testAUCs, testROCs]
        return ([], allMetrics, metaDicts)
예제 #22
0
    def __init__(self, alterRegressor, egoRegressor):
        """
        Store the two regressors.

        The alterRegressor must be a primal method, since the number of alters
        for each ego vary, and hence the dual vectors are not constant in size.

        :param alterRegressor: Checked to be an AbstractPredictor
        :param egoRegressor: Checked to be an AbstractPredictor
        """
        for regressor in (alterRegressor, egoRegressor):
            Parameter.checkClass(regressor, AbstractPredictor)

        self.alterRegressor, self.egoRegressor = alterRegressor, egoRegressor
예제 #23
0
    def standardiseArray(self, X):
        """
        Centre an array with ``centreArray`` and then normalise the result
        with ``normaliseArray``.

        :param X: The array to standardise
        :type X: :class:`numpy.ndarray`

        :return: The output of ``normaliseArray`` applied to the centred array
        """
        Parameter.checkClass(X, numpy.ndarray)
        return self.normaliseArray(self.centreArray(X))
예제 #24
0
    def predictScores(self, X):
        """
        Make predictions using the learnt model by calling the R function
        ``predict``. Returns the scores as a flat numpy array.

        :param X: The examples to score
        :type X: :class:`numpy.ndarray`
        """
        Parameter.checkClass(X, numpy.ndarray)

        rPredict = robjects.r['predict']
        dataFrame = self.baseLib.data_frame(X)
        # Wrap the R result in a matrix before converting it to numpy
        rScores = self.baseLib.matrix(rPredict(self.getModel(), dataFrame))
        return numpy.asarray(rScores).ravel()
예제 #25
0
    def __init__(self, alterRegressor, egoRegressor):
        """
        Keep the alter and ego regressors on the instance.

        The alterRegressor must be a primal method, since the number of alters
        for each ego vary, and hence the dual vectors are not constant in size.

        :param alterRegressor: Checked to be an AbstractPredictor
        :param egoRegressor: Checked to be an AbstractPredictor
        """
        expectedClass = AbstractPredictor
        Parameter.checkClass(alterRegressor, expectedClass)
        Parameter.checkClass(egoRegressor, expectedClass)

        self.alterRegressor, self.egoRegressor = alterRegressor, egoRegressor
예제 #26
0
    def standardiseArray(self, X):
        """
        Apply ``centreArray`` followed by ``normaliseArray`` to an array.

        :param X: The array to standardise
        :type X: :class:`numpy.ndarray`

        :return: The centred and then normalised array
        """
        Parameter.checkClass(X, numpy.ndarray)

        centred = self.centreArray(X)
        return self.normaliseArray(centred)
예제 #27
0
    def predictScores(self, X):
        """
        Score X with the learnt model through the R function ``predict`` and
        return the scores as a flat numpy array.

        :param X: The examples to score
        :type X: :class:`numpy.ndarray`
        """
        Parameter.checkClass(X, numpy.ndarray)

        rPredict = robjects.r['predict']
        dataFrame = self.baseLib.data_frame(X)
        # Wrap the R result in a matrix before converting it to numpy
        rScores = self.baseLib.matrix(rPredict(self.getModel(), dataFrame))
        return numpy.asarray(rScores).ravel()
예제 #28
0
    def predictROC(self, X, Y):
        """
        Compute the ROC curve of the learnt model on the examples X with
        labels Y, as returned by ``treeRankLib.getROC``, converted to a numpy
        array.

        :param X: The examples
        :type X: :class:`numpy.ndarray`

        :param Y: The labels
        :type Y: :class:`numpy.ndarray`
        """
        for arr in (X, Y):
            Parameter.checkClass(arr, numpy.ndarray)

        dataFrame = self._getDataFrame(X, Y)
        rocCurve = self.treeRankLib.getROC(self.getModel(), dataFrame)
        return numpy.array(rocCurve)
예제 #29
0
    def binaryError(testY, predY):
        """
        Work out the error on a set of -1/+1 labels, as the number of entries
        where testY and predY differ divided by the length of predY.

        :param testY: The true labels
        :type testY: :class:`numpy.ndarray`

        :param predY: The predicted labels
        :type predY: :class:`numpy.ndarray`

        :raises ValueError: If the two arrays differ in their first dimension
        """
        Parameter.checkClass(testY, numpy.ndarray)
        Parameter.checkClass(predY, numpy.ndarray)

        numExamples = predY.shape[0]
        if testY.shape[0] != numExamples:
            raise ValueError("Labels vector much be same dimensions as predicted labels")

        numMistakes = numpy.sum(testY != predY)
        return numMistakes/float(numExamples)
예제 #30
0
    def predictROC(self, X, Y):
        """
        Return, as a numpy array, the ROC curve that ``treeRankLib.getROC``
        gives for the learnt model on the examples X with labels Y.

        :param X: The examples
        :type X: :class:`numpy.ndarray`

        :param Y: The labels
        :type Y: :class:`numpy.ndarray`
        """
        for arr in (X, Y):
            Parameter.checkClass(arr, numpy.ndarray)

        dataFrame = self._getDataFrame(X, Y)
        rocCurve = self.treeRankLib.getROC(self.getModel(), dataFrame)
        return numpy.array(rocCurve)
예제 #31
0
    def binaryError(testY, predY):
        """
        Work out the error on a set of -1/+1 labels: the count of positions
        at which testY and predY disagree over the length of predY.

        :param testY: The true labels
        :type testY: :class:`numpy.ndarray`

        :param predY: The predicted labels
        :type predY: :class:`numpy.ndarray`

        :raises ValueError: If the two arrays differ in their first dimension
        """
        Parameter.checkClass(testY, numpy.ndarray)
        Parameter.checkClass(predY, numpy.ndarray)

        numExamples = predY.shape[0]
        if testY.shape[0] != numExamples:
            raise ValueError(
                "Labels vector much be same dimensions as predicted labels")

        numMistakes = numpy.sum(testY != predY)
        return numMistakes / float(numExamples)
예제 #32
0
    def _getDataFrame(self, X, Y):
        """
        Create a single R data frame from the numpy arrays X and Y.

        Both arrays are converted to R matrices and data frames, Y is bound
        to X as extra column(s), and the last column is renamed to "class".

        :param X: The examples
        :type X: :class:`numpy.ndarray`

        :param Y: The labels
        :type Y: :class:`numpy.ndarray`

        :return: The combined data frame
        """
        for arr in (X, Y):
            Parameter.checkClass(arr, numpy.ndarray)

        frameX = self.baseLib.data_frame(robjects.vectors.Matrix(X))
        frameY = self.baseLib.data_frame(robjects.vectors.Matrix(Y))

        XY = frameX.cbind(frameY)
        lastColumn = len(XY.names) - 1
        XY.names[lastColumn] = "class"
        return XY
예제 #33
0
    def _getDataFrame(self, X, Y):
        """
        Combine the numpy arrays X and Y into one R data frame.

        Each array becomes an R matrix wrapped in a data frame; the frame of
        Y is column-bound to the frame of X and the final column is named
        "class".

        :param X: The examples
        :type X: :class:`numpy.ndarray`

        :param Y: The labels
        :type Y: :class:`numpy.ndarray`

        :return: The combined data frame
        """
        for arr in (X, Y):
            Parameter.checkClass(arr, numpy.ndarray)

        frameX = self.baseLib.data_frame(robjects.vectors.Matrix(X))
        frameY = self.baseLib.data_frame(robjects.vectors.Matrix(Y))

        XY = frameX.cbind(frameY)
        lastColumn = len(XY.names)-1
        XY.names[lastColumn] = "class"
        return XY
예제 #34
0
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run stratified cross validation with the given leaf rank and record
        the AUC and ROC curve on the training and test part of every fold.
        In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`

        :param leafRank: Passed to ``setLeafRank`` before the folds are run

        :return: A tuple (bestParams, allMetrics, bestMetaDicts): bestParams
            is an empty list, allMetrics is the list [train AUCs, train ROCs,
            test AUCs, test ROCs] and bestMetaDicts has one empty dict per fold

        :raises ValueError: If Y is not 1D
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        splits = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        trainAUCs = numpy.zeros(folds)
        trainROCs = []
        testAUCs = numpy.zeros(folds)
        testROCs = []
        metaDicts = []

        for foldInd, (trainInds, testInds) in enumerate(splits):
            Util.printIteration(foldInd, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            trainCounts = str(numpy.bincount(trainY))
            logging.debug("Distribution of labels in train: " + trainCounts)
            testCounts = str(numpy.bincount(testY))
            logging.debug("Distribution of labels in test: " + testCounts)

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            # auc is called with (scores, labels) but roc with (labels, scores)
            trainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
            testAUCs[foldInd] = Evaluator.auc(predTestY, testY)

            trainROCs.append(Evaluator.roc(trainY, predTrainY))
            testROCs.append(Evaluator.roc(testY, predTestY))

            # No per-fold meta data is recorded here
            metaDicts.append({})

        logging.debug("Mean test AUC = " + str(numpy.mean(testAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(testAUCs)))

        allMetrics = [trainAUCs, trainROCs, testAUCs, testROCs]
        return ([], allMetrics, metaDicts)
예제 #35
0
    def summary(self, graph):
        """
        Compute a summary statistic on the input HIV graph.

        For each time in ``self.times`` the subgraph of the vertices returned
        by ``graph.infectedIndsAt`` is taken, and its number of vertices and
        number of edges are recorded.

        :param graph: The graph to summarise
        :type graph: :class:`HIVGraph`

        :return: An array with one row per time and the columns
            (number of vertices, number of edges)
        """
        Parameter.checkClass(graph, HIVGraph)

        numTimes = self.times.shape[0]
        summaryArray = numpy.zeros((numTimes, 2))

        for i in range(numTimes):
            infectedInds = graph.infectedIndsAt(self.times[i])
            subgraph = graph.subgraph(infectedInds)

            counts = [subgraph.getNumVertices(), subgraph.getNumEdges()]
            summaryArray[i, :] = numpy.array(counts)

        return summaryArray
예제 #36
0
파일: Latex.py 프로젝트: kentwang/sandbox
    def listToRow(lst):
        """
        Take a list and convert into a row of a latex table.

        The items are converted with str() and separated by " & "; the row is
        terminated by a double backslash. An empty list gives an empty string.

        :param lst: The items of the row
        :type lst: :class:`list`

        :return: The row as a string
        """
        Parameter.checkClass(lst, list)

        # Without items there is no row, hence no terminator either
        if not lst:
            return ""

        cells = [str(item) for item in lst]
        return " & ".join(cells) + "\\\\"
예제 #37
0
파일: Latex.py 프로젝트: rezaarmand/sandbox
    def listToRow(lst):
        """
        Convert a list into one row of a latex table.

        Every item is passed through str(); the items are joined with " & "
        and followed by a double backslash. For an empty list the empty
        string is returned.

        :param lst: The items of the row
        :type lst: :class:`list`

        :return: The row as a string
        """
        Parameter.checkClass(lst, list)

        # Without items there is no row, hence no terminator either
        if not lst:
            return ""

        cells = [str(item) for item in lst]
        return " & ".join(cells) + "\\\\"
예제 #38
0
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run stratified cross validation with the given leaf rank, gathering
        for each fold the AUC and ROC curve on both the training and the test
        examples. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`

        :param leafRank: Passed to ``setLeafRank`` before the folds are run

        :return: A tuple (bestParams, allMetrics, bestMetaDicts): bestParams
            is an empty list, allMetrics is the list [train AUCs, train ROCs,
            test AUCs, test ROCs] and bestMetaDicts has one empty dict per fold

        :raises ValueError: If Y is not 1D
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        splits = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        trainAUCs = numpy.zeros(folds)
        trainROCs = []
        testAUCs = numpy.zeros(folds)
        testROCs = []
        metaDicts = []

        for foldInd, (trainInds, testInds) in enumerate(splits):
            Util.printIteration(foldInd, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            trainCounts = str(numpy.bincount(trainY))
            logging.debug("Distribution of labels in train: " + trainCounts)
            testCounts = str(numpy.bincount(testY))
            logging.debug("Distribution of labels in test: " + testCounts)

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            # auc is called with (scores, labels) but roc with (labels, scores)
            trainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
            testAUCs[foldInd] = Evaluator.auc(predTestY, testY)

            trainROCs.append(Evaluator.roc(trainY, predTrainY))
            testROCs.append(Evaluator.roc(testY, predTestY))

            # No per-fold meta data is recorded here
            metaDicts.append({})

        logging.debug("Mean test AUC = " + str(numpy.mean(testAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(testAUCs)))

        return ([], [trainAUCs, trainROCs, testAUCs, testROCs], metaDicts)
예제 #39
0
    def parallelPenaltyGrid(self,
                            trainX,
                            trainY,
                            fullX,
                            fullY,
                            paramDict,
                            errorFunc=computeIdealPenalty):
        """
        Find out the "ideal" penalty using a training set and the full dataset. If one specifies
        a different error function then that is computed over the grid of parameters.

        For every combination of the values in paramDict a copy of this
        learner is configured by calling the setter named by each key, and
        errorFunc is evaluated on (trainX, trainY, fullX, fullY, learner) in
        a process pool.

        :param trainX: The training examples
        :type trainX: :class:`numpy.ndarray`

        :param trainY: The training labels
        :type trainY: :class:`numpy.ndarray`

        :param fullX: The examples of the full dataset, forwarded to errorFunc

        :param fullY: The labels of the full dataset, forwarded to errorFunc

        :param paramDict: Maps the name of a setter method of the learner to
            an array of values to try
        :type paramDict: :class:`dict`

        :param errorFunc: The function evaluated for every grid point

        :return: An array with one axis per key of paramDict (in the dict's
            iteration order) holding the result of errorFunc at each grid point
        """
        Parameter.checkClass(trainX, numpy.ndarray)
        Parameter.checkClass(trainY, numpy.ndarray)

        # Fix the key order once so that grid axes and setter calls line up
        keys = list(paramDict.keys())
        gridSize = [paramDict[key].shape[0] for key in keys]
        gridInds = [numpy.arange(size) for size in gridSize]

        idealPenalties = numpy.zeros(tuple(gridSize))

        paramList = []
        for inds in itertools.product(*gridInds):
            learner = self.copy()

            for key, ind in zip(keys, inds):
                setter = getattr(learner, key)
                setter(paramDict[key][ind])

            paramList.append((trainX, trainY, fullX, fullY, learner))

        pool = multiprocessing.Pool(processes=self.processes,
                                    maxtasksperchild=100)
        try:
            resultsIterator = pool.imap(errorFunc, paramList, self.chunkSize)

            # imap preserves task order, so iterate the grid in the same order
            for inds in itertools.product(*gridInds):
                # The builtin next() works on Python 2.6+ and Python 3,
                # unlike the Python-2-only iterator.next() method
                idealPenalties[inds] = next(resultsIterator)
        finally:
            # Always release the worker processes, even if a task raised
            pool.terminate()

        return idealPenalties
예제 #40
0
파일: Util.py 프로젝트: kentwang/sandbox
    def expandIntArray(v):
        """
        Take a vector of integers and expand it into a vector with counts of the
        corresponding integers. For example, with v = [1, 3, 2, 4], the expanded
        vector is [0, 1, 1, 1, 2, 2, 3, 3, 3, 3].

        :param v: The counts; entry i says how many times index i is repeated
        :type v: :class:`numpy.ndarray`

        :return: An integer array of length sum(v)
        """
        Parameter.checkClass(v, numpy.ndarray)
        Parameter.checkList(v, Parameter.checkInt, [0, float("inf")])

        # The builtin int is what the numpy.int alias stood for; the alias
        # itself was removed in NumPy 1.24
        w = numpy.zeros(numpy.sum(v), int)
        currentInd = 0

        # Fill consecutive runs of length v[i] with the index i
        for i in range(v.shape[0]):
            w[currentInd : currentInd + v[i]] = i
            currentInd += v[i]

        return w
예제 #41
0
 def parallelPenaltyGridRbf(self, trainX, trainY, fullX, fullY, type="C_SVC"):
     """
     Find out the "ideal" penalty over a grid of RBF parameters.

     The grid is built from ``getCs()`` and ``getGammas()``, plus
     ``getEpsilons()`` whenever ``type`` is not "C_SVC", and is passed to
     ``parallelPenaltyGrid`` whose result is returned unchanged.

     :param trainX: The training examples
     :type trainX: :class:`numpy.ndarray`

     :param trainY: The training labels
     :type trainY: :class:`numpy.ndarray`

     :param fullX: Forwarded to ``parallelPenaltyGrid``

     :param fullY: Forwarded to ``parallelPenaltyGrid``

     :param type: The SVM type; "C_SVC" leaves epsilon out of the grid
     :type type: :class:`str`
     """
     for arr in (trainX, trainY):
         Parameter.checkClass(arr, numpy.ndarray)

     paramDict = {"setC": self.getCs(), "setGamma": self.getGammas()}
     # Every type other than C_SVC also searches over epsilon
     if type != "C_SVC":
         paramDict["setEpsilon"] = self.getEpsilons()

     return self.parallelPenaltyGrid(trainX, trainY, fullX, fullY, paramDict)
예제 #42
0
파일: LibSVM.py 프로젝트: kentwang/sandbox
    def parallelPenaltyGridRbf(self, trainX, trainY, fullX, fullY, type="C_SVC"):
        """
        Find out the "ideal" penalty over this learner's parameter grid.

        The grid is described by a dictionary whose keys are setter names and
        whose values come from getCs() and getGammas(), with getEpsilons()
        added when type is not "C_SVC". The computation itself is done by
        parallelPenaltyGrid.

        :param trainX: The training examples
        :type trainX: :class:`numpy.ndarray`

        :param trainY: The training labels
        :type trainY: :class:`numpy.ndarray`

        :param fullX: Forwarded as-is to parallelPenaltyGrid

        :param fullY: Forwarded as-is to parallelPenaltyGrid

        :param type: "C_SVC" uses C and gamma only, anything else adds epsilon

        :return: The result of parallelPenaltyGrid
        """
        Parameter.checkClass(trainX, numpy.ndarray)
        Parameter.checkClass(trainY, numpy.ndarray)

        searchesEpsilon = type != "C_SVC"

        paramDict = {}
        paramDict["setC"] = self.getCs()
        paramDict["setGamma"] = self.getGammas()
        if searchesEpsilon:
            paramDict["setEpsilon"] = self.getEpsilons()

        return self.parallelPenaltyGrid(trainX, trainY, fullX, fullY, paramDict)
예제 #43
0
    def eigenRemove(omega, Q, n, k, debug=False):
        """
        Remove a set of rows and columns from a matrix whose eigen-decomposition
        is Q diag(omega) Q^T. Keep the first n rows/cols i.e. the rows/cols starting
        from n to the end are removed and k is the number of eigenvectors/values
        to return for the new matrix. We could generalise this to delete a given
        list of rows/cols.

        :param omega: The eigenvalues, a 1D array with one entry per column of Q
        :type omega: :class:`numpy.ndarray`

        :param Q: The eigenvectors as columns
        :type Q: :class:`numpy.ndarray`

        :param n: The number of leading rows/cols to keep
        :type n: :class:`int`

        :param k: The number of eigenvectors/values to compute for the new matrix
        :type k: :class:`int`

        :param debug: If True, also return the intermediate matrices
        :type debug: :class:`bool`

        :return: (pi, V[0:n, :]) or, when debug is True, (pi, V[0:n, :], K, Y1, Y2, omega)

        :raises ValueError: If omega is not 1D or its length differs from the number of columns of Q
        """
        #logging.debug("< eigenRemove >")
        Parameter.checkClass(omega, numpy.ndarray)
        Parameter.checkClass(Q, numpy.ndarray)
        Parameter.checkInt(k, 0, float('inf'))
        Parameter.checkInt(n, 0, Q.shape[0])
        if omega.ndim != 1:
            raise ValueError("omega must be 1-d array")
        if omega.shape[0] != Q.shape[1]:
            raise ValueError("Must have same number of eigenvalues and eigenvectors")

        if __debug__:
            Parameter.checkOrthogonal(Q, tol=EigenUpdater.tol, softCheck=True, arrayInfo="input Q in eigenRemove()")

        # Order by decreasing magnitude, then keep the eigenvalues above the tolerance
        inds = numpy.flipud(numpy.argsort(numpy.abs(omega)))
        inds = inds[omega[inds]>EigenUpdater.tol]

        omega, Q = Util.indEig(omega, Q, inds[0:k])
        # Blocks of Q diag(omega) Q^T involving the rows/cols to be removed
        AB = (Q[0:n, :]*omega).dot(Q[n:, :].T)
        BB = (Q[n:, :]*omega).dot(Q[n:, :].T)

        # Express the removal as the update Y1 Y2^T + Y2 Y1^T
        p = BB.shape[0]
        Y1 = numpy.r_[numpy.zeros((n, p)), numpy.eye(p)]
        Y2 = -numpy.r_[AB, 0.5*BB]
        pi, V = EigenUpdater.eigenAdd2(omega, Q, Y1, Y2, k)

        #check last rows are zero
        removedNorm = numpy.linalg.norm(V[n:, :])
        if removedNorm >= EigenUpdater.tol:
            # logging.warn is a deprecated alias that newer Python versions no longer provide
            logging.warning("numpy.linalg.norm(V[n:, :])= %s" % str(removedNorm))

        #logging.debug("</ eigenRemove >")
        if not debug:
            return pi, V[0:n, :]
        else:
            C = (Q*omega).dot(Q.T)
            K = C + Y1.dot(Y2.T) + Y2.dot(Y1.T)
            assert numpy.linalg.norm(BB- C[n:, n:]) <= EigenUpdater.tol
            assert numpy.linalg.norm(AB - C[0:n, n:]) <= EigenUpdater.tol, "%s \n %s" % (AB, C[0:n, n:])
            return pi, V[0:n, :], K, Y1, Y2, omega
예제 #44
0
    def learnModel(self, X, y):
        """
        Train a forest of TreeRank learners, each on a random sample of the
        examples given as the rows of X with labels in the 1D array y. The
        trained trees are stored in self.forestList and the indices each one
        was trained on in self.indList.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param y: A vector of -1/+1 labels
        :type y: :class:`ndarray`

        :raises ValueError: If y does not contain exactly the two labels -1 and +1
        """
        for array in (X, y):
            Parameter.checkClass(array, numpy.ndarray)
        for array in (X, y):
            Parameter.checkArray(array)

        labels = numpy.unique(y)
        if labels.shape[0] != 2:
            raise ValueError("Can only accept binary labelled data")
        if (labels != numpy.array([-1, 1])).any():
            raise ValueError("Labels must be -1/+1: " + str(labels))

        numExamples = X.shape[0]
        sampleCount = int(numpy.round(self.sampleSize * numExamples))
        trees = []
        sampledInds = []

        for treeIndex in range(self.numTrees):
            Util.printConciseIteration(treeIndex, 1, self.numTrees, "Tree: ")

            # Draw the training sample for this tree, with or without replacement
            if self.sampleReplace:
                inds = numpy.random.randint(0, numExamples, sampleCount)
            else:
                inds = numpy.random.permutation(numExamples)[:sampleCount]

            tree = TreeRank(self.leafRanklearner)
            tree.setMaxDepth(self.maxDepth)
            tree.setMinSplit(self.minSplit)
            tree.setFeatureSize(self.featureSize)
            tree.setBestResponse(self.bestResponse)
            tree.learnModel(X[inds, :], y[inds])

            trees.append(tree)
            sampledInds.append(inds)

        self.forestList = trees
        self.indList = sampledInds
예제 #45
0
파일: Util.py 프로젝트: kentwang/sandbox
    def matrixPowerh(A, n):
        """
        Compute the matrix power of A using the exponent n. The computation simply
        evaluates the eigendecomposition of A and then powers the eigenvalues
        accordingly. Eigenvalues whose magnitude is below 10**-10 are set to
        zero and are not powered.

        This version assumes that A is hermitian.
        Warning: if at least one eigen-value is negative, n should be an integer.

        :param A: The hermitian matrix to power
        :type A: :class:`numpy.ndarray`

        :param n: The exponent

        :return: The matrix V diag(lmbda**n) V^H, where lmbda and V are the eigenvalues and eigenvectors of A
        """
        Parameter.checkClass(A, numpy.ndarray)
        tol = 10 ** -10

        lmbda, V = scipy.linalg.eigh(A)
        # Every eigenvalue is either zeroed or powered; using a single mask
        # avoids leaving a value of magnitude exactly tol untouched.
        nonZero = numpy.abs(lmbda) >= tol
        lmbda[~nonZero] = 0
        lmbda[nonZero] = lmbda[nonZero] ** n
        # next line uses the fact that eigh claims returning an orthonormal basis (even if
        # one sub-space is of dimension >=2) (to be precise, it claims using dsyevd which claims returning an orthonormal matrix)
        # The conjugate transpose is needed for complex hermitian A; it equals V.T when V is real.
        return (V * lmbda).dot(V.conj().T)
예제 #46
0
    def learnModel(self, X, y):
        """
        Fit one TreeRank model per tree of the forest, each on a random subset
        of the rows of X and the matching entries of the 1D label array y.
        On completion self.forestList holds the fitted trees and self.indList
        the example indices used for each of them.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param y: A vector of -1/+1 labels
        :type y: :class:`ndarray`

        :raises ValueError: If the labels are not exactly -1 and +1
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(y, numpy.ndarray)
        Parameter.checkArray(X)
        Parameter.checkArray(y)

        uniqueLabels = numpy.unique(y)
        if uniqueLabels.shape[0] != 2:
            raise ValueError("Can only accept binary labelled data")
        if (uniqueLabels != numpy.array([-1, 1])).any():
            raise ValueError("Labels must be -1/+1: " + str(uniqueLabels))

        fittedTrees, usedInds = [], []
        subsetSize = int(numpy.round(self.sampleSize*X.shape[0]))

        for t in range(self.numTrees):
            Util.printConciseIteration(t, 1, self.numTrees, "Tree: ")

            # Sampling with replacement draws indices directly; without
            # replacement takes the head of a random permutation.
            subset = (
                numpy.random.randint(0, X.shape[0], subsetSize)
                if self.sampleReplace
                else numpy.random.permutation(X.shape[0])[0:subsetSize]
            )

            ranker = TreeRank(self.leafRanklearner)
            ranker.setMaxDepth(self.maxDepth)
            ranker.setMinSplit(self.minSplit)
            ranker.setFeatureSize(self.featureSize)
            ranker.setBestResponse(self.bestResponse)
            ranker.learnModel(X[subset, :], y[subset])

            fittedTrees.append(ranker)
            usedInds.append(subset)

        self.forestList = fittedTrees
        self.indList = usedInds
예제 #47
0
    def evaluate(self, X1, X2):
        """
        Compute the matrix of inner products between the rows of X1 and the
        rows of X2, i.e. numpy.dot(X1, X2.T). Both matrices hold one example
        per row and must have an identical number of columns.

        :param X1: First set of examples.
        :type X1: :class:`numpy.ndarray`

        :param X2: Second set of examples.
        :type X2: :class:`numpy.ndarray`

        :return: The matrix numpy.dot(X1, X2.T)

        :raises ValueError: If X1 and X2 have a different number of columns
        """
        for examples in (X1, X2):
            Parameter.checkClass(examples, numpy.ndarray)

        if X1.shape[1] == X2.shape[1]:
            return numpy.dot(X1, X2.T)

        shapes = str(X1.shape) + " " + str(X2.shape)
        raise ValueError("Invalid matrix dimentions: " + shapes)
예제 #48
0
    def learnModel(self, X, Y):
        """
        Learn a model for a set of examples given as the rows of the matrix X,
        with corresponding labels given in the elements of 1D array Y. The
        resulting tree is stored in self.tree.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of binary -1/+1 labels as a 1D array
        :type Y: :class:`ndarray`

        :raises ValueError: If Y does not contain exactly the two labels -1 and +1
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkArray(X)
        Parameter.checkArray(Y)
        labels = numpy.unique(Y)
        if labels.shape[0] != 2:
            raise ValueError("Can only accept binary labelled data: " + str(labels))
        if (labels != numpy.array([-1, 1])).any():
            raise ValueError("Labels must be -1/+1: " + str(labels))

        # featureSize is the fraction of features to use; when unset it
        # defaults to sqrt(numFeatures)/numFeatures.
        if self.featureSize is None:
            featureSize = numpy.sqrt(X.shape[1])/float(X.shape[1])
        else:
            featureSize = self.featureSize

        tree = DictTree()
        trainInds = numpy.arange(Y.shape[0])
        numFeatures = int(numpy.round(X.shape[1]*featureSize))
        featureInds = numpy.sort(numpy.random.permutation(X.shape[1])[0:numFeatures])

        #Seed the tree
        node = RankNode(trainInds, featureInds)
        tree.setVertex((0, 0), node)

        # Visit every vertex id (d, k) with d below maxDepth and split the
        # existing vertices that are neither pure nor leaves.
        for d in range(self.maxDepth):
            for k in range(2**d):
                if not tree.vertexExists((d, k)):
                    continue

                node = tree.getVertex((d, k))
                if not node.isPure() and not node.isLeafNode():
                    self.splitNode(tree, X, Y, d, k)

        self.tree = tree
예제 #49
0
    def learnModel(self, X, Y):
        """
        Learn the weight matrix which matches X and Y.

        A PrimalDualCCA model is trained on (X, Y) and stored in self.pdcca;
        its outputs alpha and V are combined into the matrix
        A = alpha V^T (V V^T + 10**-5 I)^-1, which is stored in self.A along
        with the training examples in self.X.

        :param X: The examples, with at least one row and one column
        :type X: :class:`numpy.ndarray`

        :param Y: The second argument given to PrimalDualCCA.learnModel
        :type Y: :class:`numpy.ndarray`

        :return: The learnt matrix self.A
        """
        for array in (X, Y):
            Parameter.checkClass(array, numpy.ndarray)
        for axis in (0, 1):
            Parameter.checkInt(X.shape[axis], 1, float('inf'))

        self.pdcca = PrimalDualCCA(self.kernel, self.tau1, self.tau2)
        alpha, V, _ = self.pdcca.learnModel(X, Y)

        # A small multiple of the identity is added before inverting V V^T
        ridge = 10**-5
        regularisedVV = numpy.dot(V, V.T) + ridge * numpy.eye(V.shape[0])

        self.A = Util.mdot(alpha, V.T, numpy.linalg.inv(regularisedVV))
        self.X = X

        return self.A
예제 #50
0
    def evaluate(self, X1, X2):
        """
        Evaluate the kernel between two matrices X1 and X2 as the product
        numpy.dot(X1, X2.T). The rows of both matrices are examples and the
        number of columns must match.

        :param X1: First set of examples.
        :type X1: :class:`numpy.ndarray`

        :param X2: Second set of examples.
        :type X2: :class:`numpy.ndarray`

        :return: The matrix numpy.dot(X1, X2.T)

        :raises ValueError: If the number of columns of X1 and X2 differ
        """
        Parameter.checkClass(X1, numpy.ndarray)
        Parameter.checkClass(X2, numpy.ndarray)

        columnsMatch = X1.shape[1] == X2.shape[1]
        if not columnsMatch:
            message = "Invalid matrix dimentions: " + str(X1.shape)
            message = message + " " + str(X2.shape)
            raise ValueError(message)

        return numpy.dot(X1, X2.T)
예제 #51
0
    def predict(self, X):
        """
        Score a set of examples given as the rows of the matrix X. The score
        of an example is the sum of the predictions of the first
        self.numTrees trees in self.forestList divided by self.numTrees.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :return: A vector of scores corresponding to each example.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X)

        total = numpy.zeros(X.shape[0])

        for treeIndex in range(self.numTrees):
            tree = self.forestList[treeIndex]
            total += tree.predict(X)

        return total / self.numTrees
예제 #52
0
    def predict(self, X):
        """
        Predict a score for each example (row) of the matrix X by averaging
        the predictions of the trees self.forestList[0], ...,
        self.forestList[self.numTrees - 1].

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :return: A vector of scores corresponding to each example.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X)

        numExamples = X.shape[0]
        accumulated = numpy.zeros(numExamples)

        for t in range(self.numTrees):
            accumulated += self.forestList[t].predict(X)

        averaged = accumulated/self.numTrees
        return averaged
예제 #53
0
    def evaluateLearn2(X, Y, indexList, learnModel, predict, metricMethods):
        """
        Evaluate a learner given as a pair of functions (learnModel, predict)
        over a list of train/test splits, recording every metric on both the
        training and the test examples of each split.

        #Could combine this with evaluateLearn

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param Y: The labels as a 1D array
        :type Y: :class:`numpy.ndarray`

        :param indexList: An iterable of (trainInds, testInds) pairs

        :param learnModel: A function called as learnModel(trainX, trainY)

        :param predict: A function called as predict(X) returning predicted labels

        :param metricMethods: A sequence of functions called as metric(trueY, predictedY)

        :return: (trainMetrics, testMetrics), each a list with one list per metric holding one value per split

        :raises ValueError: If Y is not 1D
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X, softCheck=True)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkArray(Y, softCheck=True)

        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        numMetrics = len(metricMethods)
        trainMetrics = [[] for _ in range(numMetrics)]
        testMetrics = [[] for _ in range(numMetrics)]

        for trainInds, testInds in indexList:
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            learnModel(trainX, trainY)
            predTrainY = predict(trainX)
            predTestY = predict(testX)

            #Now compute all metrics
            for j, metricMethod in enumerate(metricMethods):
                trainMetrics[j].append(metricMethod(trainY, predTrainY))
                testMetrics[j].append(metricMethod(testY, predTestY))

            gc.collect()

        logging.debug("All done")

        return trainMetrics, testMetrics
예제 #54
0
    def parallelVfPenRbf(self, X, y, idx, Cvs, type="C_SVC"):
        """
        Perform v fold penalisation model selection using the RBF kernel.
        The kernel is set to "gaussian", the parameter grid is built from
        getCs() and getGammas() (plus getEpsilons() when type is not "C_SVC")
        and the selection itself is delegated to parallelPen. Cvs controls the
        amount of penalisation.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param idx: A list of train/test splits

        :param Cvs: The penalisation constants, one result is produced for each
        :type Cvs: :class:`numpy.ndarray`

        :param type: "C_SVC" searches over C and gamma only, anything else adds epsilon

        :return: The result of parallelPen
        """
        for array in (X, y, Cvs):
            Parameter.checkClass(array, numpy.ndarray)

        self.setKernel("gaussian")

        # Keys are the names of the setters applied to each candidate learner
        paramDict = {"setC": self.getCs(), "setGamma": self.getGammas()}
        if type != "C_SVC":
            paramDict["setEpsilon"] = self.getEpsilons()

        return self.parallelPen(X, y, idx, paramDict, Cvs)
예제 #55
0
    def lazyEigenConcatAsUpdate(omega, Q, AB, BB, k, debug=False):
        """
        Find the eigen update of a matrix [A, B]'[A B] where
        A'A = Q diag(omega) Q* and AB = A*B, BB = B*B. Q is the set of
        eigenvectors of A*A and omega is the vector of eigenvalues.

        Q is padded with zero rows and the decomposition is then updated with
        EigenUpdater.eigenAdd2. The computation could be upgraded a bit
        because of the particular update type
        (Y1Bar = Y1 = [0,I]',  Y2Bar = [(I-QQ')A'B, 0]').

        :param omega: The eigenvalues as a 1D array, one per column of Q
        :type omega: :class:`numpy.ndarray`

        :param Q: The eigenvectors as columns
        :type Q: :class:`numpy.ndarray`

        :param AB: The matrix A*B, with as many rows as Q
        :type AB: :class:`numpy.ndarray`

        :param BB: The square matrix B*B, with as many rows as AB has columns
        :type BB: :class:`numpy.ndarray`

        :param k: The number of eigenvectors/values passed on to eigenAdd2
        :type k: :class:`int`

        :param debug: Passed on to eigenAdd2

        :return: The result of EigenUpdater.eigenAdd2

        :raises ValueError: If the shapes of omega, Q, AB and BB are inconsistent
        """
        #logging.debug("< lazyEigenConcatAsUpdate >")
        for array in (omega, Q, AB, BB):
            Parameter.checkClass(array, numpy.ndarray)
        Parameter.checkInt(k, 0, AB.shape[0] + BB.shape[0])

        if not (numpy.isrealobj(omega) and numpy.isrealobj(Q)):
            logging.info("Eigenvalues or eigenvectors are not real")
        if not (numpy.isrealobj(AB) and numpy.isrealobj(BB)):
            logging.info("AB or BB are not real")

        if omega.ndim != 1:
            raise ValueError("omega must be 1-d array")
        if omega.shape[0] != Q.shape[1]:
            raise ValueError("Must have same number of eigenvalues and eigenvectors")
        if Q.shape[0] != AB.shape[0]:
            raise ValueError("Q must have the same number of rows as AB")
        if AB.shape[1] != BB.shape[0] or BB.shape[0] != BB.shape[1]:
            raise ValueError("AB must have the same number of cols/rows as BB")

        if __debug__:
            isOrthogonal = Parameter.checkOrthogonal(
                Q, tol=EigenUpdater.tol, softCheck=True, investigate=True,
                arrayInfo="input Q in lazyEigenConcatAsUpdate()")
            if not isOrthogonal:
                print("omega:\n", omega)

        numOldRows = Q.shape[0]
        numNewRows = BB.shape[0]

        # Pad Q with zero rows for the new block, then express the
        # concatenation as the update Y1 Y2^T + Y2 Y1^T
        paddedQ = numpy.r_[Q, numpy.zeros((numNewRows, Q.shape[1]))]
        Y1 = numpy.r_[numpy.zeros((numOldRows, numNewRows)), numpy.eye(numNewRows)]
        Y2 = numpy.r_[AB, 0.5*BB]
        return EigenUpdater.eigenAdd2(omega, paddedQ, Y1, Y2, k, debug=debug)
예제 #56
0
    def parallelModelSelect(self, X, y, idx, paramDict):
        """
        Perform parallel model selection using any learner.
        Using the best set of parameters train using the whole dataset.

        One copy of this learner is configured for every fold and every point
        of the parameter grid, its error is computed with computeTestError and
        the errors are averaged over the folds. The work is spread over a
        process pool unless self.processes is 1.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param idx: A list of train/test splits

        :param paramDict: A dictionary index by the method name and with value as an array of values
        :type paramDict: :class:`dict`

        :return: (learner, meanErrors) where learner comes from getBestLearner and meanErrors is the grid of mean errors over the folds
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(y, numpy.ndarray)
        folds = len(idx)

        # Fix the parameter order once so that the axes of the error grid and
        # the setters applied to each learner always agree.
        paramNames = list(paramDict.keys())
        gridSize = [paramDict[name].shape[0] for name in paramNames]
        gridInds = [numpy.arange(size) for size in gridSize]

        meanErrors = numpy.zeros(tuple(gridSize))
        paramList = []

        for trainInds, testInds in idx:
            trainX, trainY = X[trainInds, :], y[trainInds]
            testX, testY = X[testInds, :], y[testInds]

            for inds in itertools.product(*gridInds):
                learner = self.copy()

                # Apply the value selected for each parameter via its setter
                for name, ind in zip(paramNames, inds):
                    getattr(learner, name)(paramDict[name][ind])

                paramList.append((trainX, trainY, testX, testY, learner))

        pool = None
        try:
            if self.processes != 1:
                pool = multiprocessing.Pool(processes=self.processes,
                                            maxtasksperchild=100)
                resultsIterator = pool.imap(computeTestError, paramList,
                                            self.chunkSize)
            else:
                # itertools.imap only exists on Python 2; the builtin map is
                # already lazy on Python 3.
                lazyMap = getattr(itertools, "imap", map)
                resultsIterator = lazyMap(computeTestError, paramList)

            # Results arrive in the order the tasks were created: fold by
            # fold, and grid point by grid point within each fold.
            for _ in idx:
                for inds in itertools.product(*gridInds):
                    error = next(resultsIterator)
                    meanErrors[inds] += error / float(folds)
        finally:
            # Always release the worker processes, even if a task failed
            if pool is not None:
                pool.terminate()

        learner = self.getBestLearner(meanErrors, paramDict, X, y, idx)

        return learner, meanErrors
예제 #57
0
    def parallelPen(self, X, y, idx, paramDict, Cvs, errorFunc=computeVFPen):
        """
        Perform parallel penalisation using any learner.
        Using the best set of parameters train using the whole dataset.

        For every point of the parameter grid the penalty is the mean over the
        folds of errorFunc, and the training error is computed with
        computeTrainError on all the examples. For each entry of Cvs the
        penalties are scaled, added to the training errors and the best
        learner is selected with getBestLearner.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param idx: A list of train/test splits

        :param paramDict: A dictionary index by the method name and with value as an array of values
        :type paramDict: :class:`dict`

        :param Cvs: The penalisation controls. A non-negative value multiplies the penalties, a negative value selects the corrected penalisation with sigma = abs(Cv), and an array holds a learning rate for each grid point

        :param errorFunc: The function computing the penalty from a tuple (trainX, trainY, X, y, learner)

        :return: A list with one tuple (learner, trainErrors, currentPenalties) per entry of Cvs
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(y, numpy.ndarray)
        folds = len(idx)

        # Fix the parameter order once so that the axes of the grids and the
        # setters applied to each learner always agree.
        paramNames = list(paramDict.keys())
        gridSize = [paramDict[name].shape[0] for name in paramNames]
        gridInds = [numpy.arange(size) for size in gridSize]

        trainErrors = numpy.zeros(tuple(gridSize))
        penalties = numpy.zeros(tuple(gridSize))

        def configuredLearner(inds):
            # Copy this learner and apply the grid point inds via the setters
            learner = self.copy()
            for name, ind in zip(paramNames, inds):
                getattr(learner, name)(paramDict[name][ind])
            return learner

        paramList = []
        for trainInds, testInds in idx:
            trainX, trainY = X[trainInds, :], y[trainInds]

            for inds in itertools.product(*gridInds):
                paramList.append((trainX, trainY, X, y, configuredLearner(inds)))

        #Create parameters for learning on all examples and test on all
        paramList2 = [(X, y, configuredLearner(inds))
                      for inds in itertools.product(*gridInds)]

        pool = multiprocessing.Pool(processes=self.processes,
                                    maxtasksperchild=100)
        try:
            resultsIterator = pool.imap(errorFunc, paramList, self.chunkSize)
            resultsIterator2 = pool.imap(computeTrainError, paramList2,
                                         self.chunkSize)

            # Results arrive in task order: fold by fold, then grid point
            for _ in idx:
                for inds in itertools.product(*gridInds):
                    penalties[inds] += next(resultsIterator) / float(folds)

            for inds in itertools.product(*gridInds):
                trainErrors[inds] = next(resultsIterator2)
        finally:
            # Always release the worker processes, even if a task failed
            pool.terminate()

        #Store v fold penalised error
        #In the case that Cv < 0 we use the corrected penalisation
        resultsList = []
        for Cv in Cvs:
            #If Cv is an array then each value is learning rate beta for the corresponding params
            if isinstance(Cv, numpy.ndarray):
                tempCv = ((folds - 1)**Cv / (folds**(Cv - 1)))
                logging.debug(
                    "Computing learning rate penalisation with Cv.shape=" +
                    str(tempCv.shape))
                currentPenalties = penalties * tempCv
            elif Cv >= 0:
                logging.debug("Computing penalisation of Cv=" + str(Cv))
                currentPenalties = penalties * Cv
            else:
                sigma = abs(Cv)
                logging.debug(
                    "Computing corrected penalisation with sigma=" +
                    str(sigma))
                # The multiplier goes from folds (zero training error)
                # towards folds - 1 as the training error grows.
                decay = numpy.exp(-sigma * trainErrors)
                dynamicCv = (folds - 1) * (1 - decay) + float(folds) * decay
                currentPenalties = penalties * dynamicCv

            meanErrors = trainErrors + currentPenalties
            learner = self.getBestLearner(meanErrors, paramDict, X, y, idx)
            resultsList.append((learner, trainErrors, currentPenalties))

        return resultsList