Example #1
File: ols_vs_lad.py  Project: 418877216/-
def train_LAD(x, y):
    """
    Train a LAD (least absolute deviations) linear regression model and return its predicted values.
    """
    X = sm.add_constant(x)
    model = QuantReg(y, X)
    model = model.fit(q=0.5)
    re = model.predict(X)
    return re
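A minimal usage sketch for train_LAD (not part of the original project), assuming the snippet's module already imports statsmodels.api as sm and QuantReg:

# Hypothetical usage; assumes the snippet's module has
# "import statsmodels.api as sm" and
# "from statsmodels.regression.quantile_regression import QuantReg".
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 2.0 * x + 1.0 + rng.standard_t(df=3, size=100)   # heavy-tailed noise

pred = train_LAD(x, y)                                # median (q=0.5) fitted values
print(pred[:5])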
Example #2
class QuantileRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, q=0.5):
        self.q = q

    def fit(self, X, y):
        self.model_ = QuantReg(y, smapi.add_constant(X))
        self.model_result_ = self.model_.fit(q=self.q)
        return self

    def predict(self, X):
        return self.model_result_.predict(smapi.add_constant(X))
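A hedged usage sketch for this wrapper (not from the original project); the imports below mirror what the class definition appears to assume (smapi as statsmodels.api, plus the sklearn base classes):

# Hypothetical usage; the imports reflect what the class definition assumes.
import numpy as np
import statsmodels.api as smapi
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.laplace(size=200)

qr = QuantileRegressor(q=0.9)
qr.fit(X, y)
print(qr.predict(X)[:5])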
Example #3
def calcuSlope(i, LogData, SeqDepth, Genes, Tau):
    if i % round(len(Genes) / 10) == 0:
        print(i / round(len(Genes) / 10) * 10, '%')
    X = Genes[i]
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        validIdx = np.logical_not(np.isnan(
            LogData.loc[X].values)) & (SeqDepth.values > 0)
        mod = QuantReg(LogData.loc[X].values[validIdx],
                       tools.add_constant(np.log(SeqDepth.values[validIdx])))
        # mod = smf.quantreg('response ~ variable',
        #                   pd.DataFrame({'response': LogData.loc[X], 'variable': np.log(SeqDepth)}))
        slope = mod.fit(q=Tau).params[1]
    return slope
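A hypothetical toy call of calcuSlope (not from the original project), assuming the module imports numpy as np, pandas as pd, warnings, QuantReg, and statsmodels' tools for add_constant:

# Hypothetical toy data; the call relies on calcuSlope defined above.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
SeqDepth = pd.Series(rng.integers(1_000, 50_000, size=60).astype(float))
Genes = ['gene_{}'.format(k) for k in range(10)]
# expression scales roughly with sequencing depth -> log-log slope near 1
LogData = pd.DataFrame(
    {g: np.log(SeqDepth * rng.uniform(0.5, 2.0, size=60)) for g in Genes}
).T

print(calcuSlope(0, LogData, SeqDepth, Genes, Tau=0.5))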
Example #4
class QuantileRegression:
    """Quantile regression wrapper

    It can be used in sklearn pipelines

    Example
    -------
    >>> from sktools import QuantileRegression
    >>> from sklearn.datasets import load_boston
    >>> boston = load_boston()['data']
    >>> y = load_boston()['target']
    >>> qr = QuantileRegression(quantile=0.9)
    >>> qr.fit(boston, y)
    >>> qr.predict(boston)[0:5].round(2)
    array([34.87, 28.98, 34.86, 32.67, 32.52])

    """
    def __init__(self, quantile=0.5, add_intercept=True):

        self.quantile = quantile
        self.add_intercept = add_intercept
        self.regressor = None
        self.regressor_fit = None

    def preprocess(self, X):

        X = X.copy()
        if self.add_intercept:
            X = sm.add_constant(X)
        return X

    def fit(self, X, y):

        X = self.preprocess(X)

        self.regressor = QuantReg(y, X)
        self.regressor_fit = self.regressor.fit(q=self.quantile)

    def predict(self, X, y=None):

        X = self.preprocess(X)

        return self.regressor_fit.predict(X)
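A usage sketch with synthetic data (an aside, not from the original project): load_boston used in the docstring is no longer shipped by recent scikit-learn releases, so random data stands in here; sm is assumed to be statsmodels.api:

# Hypothetical usage; assumes "import statsmodels.api as sm" and the QuantReg
# import from the snippet's module.
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(300, 2))
y = 3.0 + X @ np.array([1.5, -0.5]) + rng.normal(size=300)

qr = QuantileRegression(quantile=0.9)
qr.fit(X, y)
print(qr.predict(X)[:5].round(2))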
Example #5
    def run(self):
        """
        Build the POD models.

        Notes
        -----
        This method builds the quantile regression model. First, the censored data
        are filtered if needed. The Box Cox transformation is performed if it is
        enabled. Then it builds the POD model for the given data and, using
        bootstrap, computes all the defect quantiles needed to build the POD model
        at the confidence level.
        """

        # Run the preliminary run of the POD class
        result = self._run(self._inputSample, self._outputSample, self._detection,
                           self._noiseThres, self._saturationThres, self._boxCox,
                           self._censored)

        # get some results
        self._defects = result['inputSample']
        self._signals = result['signals']
        self._detectionBoxCox = result['detectionBoxCox']

        defectsSize = self._defects.getSize()

        # create the quantile regression object
        X = ot.NumericalSample(defectsSize, [1, 0])
        X[:, 1] = self._defects
        self._algoQuantReg = QuantReg(np.array(self._signals), np.array(X))

        # Compute the defect quantile
        defectMax = self._defects.getMax()[0]
        defectList = []
        for probLevel in self._quantile:
            # fit the quantile regression and return the NMF
            model = self._buildModel(1. - probLevel)
            # Solve the model == detectionBoxCox with defects 
            # boundaries = [0, defectMax]
            defectList.append(ot.Brent().solve(model, self._detectionBoxCox,
                                               0, defectMax))
        # create support of the interpolating function including
        # point (0, 0) and point (defectMax, max(quantile))
        xvalue = np.hstack([0, defectList, defectMax])
        yvalue = np.hstack([0., self._quantile, self._quantile.max()])
        interpModel = interp1d(xvalue, yvalue, kind='linear')
        self._PODmodel = ot.PythonFunction(1, 1, interpModel)


        ############ Confidence interval with bootstrap ########################
        # Compute a NsimulationSize defect sizes for all quantiles
        data = ot.NumericalSample(self._size, 2)
        data[:, 0] = self._inputSample
        data[:, 1] = self._outputSample
        # bootstrap of the data
        bootstrapExp = ot.BootstrapExperiment(data)
        # create a numerical sample which contains for all simulations the 
        # defect quantile value. The goal is to compute the QuantilePerComponent
        # of the simulation for each defect quantile (columns)
        self._defectsPerQuantile = ot.NumericalSample(self._simulationSize, self._quantile.size)
        for i in range(self._simulationSize):
            # generate a sample with replacement within data of the same size
            bootstrapData = bootstrapExp.generate()
            # run the preliminary analysis: censoring check and Box Cox
            result = self._run(bootstrapData[:,0], bootstrapData[:,1], self._detection,
                               self._noiseThres, self._saturationThres,
                               self._boxCox, self._censored)

            # get some results
            defects = result['inputSample']
            signals = result['signals']
            detectionBoxCox = result['detectionBoxCox']
            defectsSize = defects.getSize()

            # new quantile regression algorithm
            X = ot.NumericalSample(defectsSize, [1, 0])
            X[:, 1] = defects
            algoQuantReg = QuantReg(np.array(signals), np.array(X))

            # compute the quantile defects
            defectMax = defects.getMax()[0]
            defectList = []
            for probLevel in self._quantile:
                fit = algoQuantReg.fit(1. - probLevel, max_iter=300, p_tol=1e-2)
                def model(x):
                    X = ot.NumericalPoint([1, x[0]])
                    return ot.NumericalPoint(fit.predict(X))
                model = ot.PythonFunction(1, 1, model)
                # Solve the model == detectionBoxCox with defects 
                # boundaries = [-infinity, defectMax] : it allows negative defects
                # when for small prob level, there is no intersection with
                # the detection threshold for positive defects
                defectList.append(ot.Brent().solve(model, detectionBoxCox,
                                                   -ot.SpecFunc.MaxNumericalScalar,
                                                   defectMax))
            # add the quantile in the numerical sample as the ith simulation
            self._defectsPerQuantile[i, :] = defectList
            if self._verbose:
                updateProgress(i, self._simulationSize, 'Computing defect quantile')
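A standalone sketch of the core step above (not the original POD code): fit a quantile regression of signal on defect size, then solve model(a) == detection for the defect size. scipy's brentq stands in for ot.Brent, and plain numpy arrays for ot.NumericalSample:

# Hypothetical, self-contained illustration (not the original POD code).
import numpy as np
from scipy.optimize import brentq
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(1)
defects = rng.uniform(0.1, 2.0, size=200)                    # defect sizes
signals = 0.2 + 1.5 * defects + rng.normal(0, 0.2, size=200) # measured signals
detection = 1.0                                              # detection threshold

X = np.column_stack([np.ones_like(defects), defects])
fit = QuantReg(signals, X).fit(q=0.1)                        # q = 1 - probLevel, probLevel = 0.9

def model(a):
    return fit.predict([[1.0, a]])[0] - detection

a90 = brentq(model, 0.0, defects.max())                      # defect size detected with probability 0.9
print(a90)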
Example #6
class ForecastModelQR(ForecastModelBase):
    """
    QR forecast model
    """

    def constructModel(self):
        """
        QR is special: there is no separate model to construct; put differently, model construction and training happen at the same time, so everything is implemented in the fit() method
        :return:
        """
        pass

    def fit(self):
        optimizedHyperParameters = self.optimizedHyperParameters
        fixedHyperParameters = self.fixedHyperParameters

        kernelName = optimizedHyperParameters["kernelName"]
        trainX, trainY, validationX, validationY = self.dataset.getDataset(2)
        self.model = QuantReg(trainY, trainX)

    def predict(self, validationX=None, isFlatten=False):
        if validationX is None:
            validationX = self.dataset.validationX
        optimizedHyperParameters = self.optimizedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]
        results = self.model.fit(q=0.5, kernel=kernelName)
        predictions = self.model.predict(params=results.params, exog=validationX)
        if isFlatten:
            predictions = predictions.flatten()
        self.dataset.validationD = predictions
        return predictions

    def getOptimizedHyperParametersRange(self):
        optimizedHyperParametersRange = {
            "kernelName": hp.choice("kernelName", ['epa', 'cos', 'gau', 'par']),
        }
        return optimizedHyperParametersRange

    def getDefaultOptimizedHyperParameters(self):
        optimizedHyperParameters = dict()
        # kernel function name
        optimizedHyperParameters["kernelName"] = "epa"
        return optimizedHyperParameters

    def getDefaultFixedHyperParameters(self):
        fixedHyperParameters = dict()
        return fixedHyperParameters

    def getProbabilisticResults(self, probabilisticForecastModelParams, validationX=None):
        if validationX is None:
            validationX = self.dataset.validationX
        validSampleNum = validationX.shape[0]
        optimizedHyperParameters = self.optimizedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]

        # from 0 to 1 in steps of 0.001, i.e. exactly 1001 points
        F = np.arange(0, 1.001, 0.001)
        predictions = np.zeros(shape=(validSampleNum, len(F)))
        for i in range(len(F)):
            q = F[i]
            if 0 < q < 1:
                results = self.model.fit(q=q, kernel=kernelName)
                prediction = self.model.predict(params=results.params, exog=validationX)
                predictions[:, i] = prediction.T
        predictions[:, 0] = 2 * predictions[:, 1] - predictions[:, 2]
        predictions[:, -1] = 2 * predictions[:, -2] - predictions[:, -3]
        predictions.sort(axis=1)
        pdfs = []
        cdfs = []
        for i in range(validSampleNum):
            # from 0 to 1 in steps of 0.001, i.e. exactly 1001 points
            x = predictions[i, :]
            x = self.dataset.reverseLabel(x)
            c = dict()
            c["x"] = x
            c["F"] = F
            cdfs.append(c)

            # Going from a known PDF to the CDF is a deterministic process
            # Recovering the PDF from a known CDF, when the PDF's form is unknown, gives different PDFs depending on the assumptions of the chosen method
            # Using the area definition: with densely spaced points it can be simplified to small trapezoid or small rectangle areas, but different assumptions lead to very different PDF shapes
            # One can also draw many random samples from the CDF and apply kernel density estimation to get the PDF; in short, it all depends on the assumptions

            # Method 1: area definition with small rectangles; method 1 is recommended in this workflow
            xNew = np.linspace(x.min(), x.max(), len(x))
            y = MathInterpolateUtils.interp1d(x, F, xNew, kind="slinear")
            f = np.zeros(shape=x.shape)
            for j in range(1, len(f)):
                f[j] = (y[j] - y[j - 1]) / (xNew[j] - xNew[j - 1])
            x = xNew

            # Method 2: area definition with small trapezoids
            # f = np.zeros(shape=x.shape)
            # for j in range(1, len(F)):
            #     f[j] = 2 * (F[j] - F[j - 1]) / (x[j] - x[j - 1]) - f[j - 1]

            # Method 3: kernel density estimation
            # First, uniformly distributed random numbers would have to be generated from the CDF; since the quantile levels are already uniformly spaced, the corresponding x values can be used directly for the estimate
            # Method 3 is very time-consuming; apart from displaying the PDF for a few periods, the workflow mostly uses the CDF rather than the PDF, so method 3 is not recommended here
            # Use this method only in the service dedicated to displaying PDFs
            # paramGrid = {'bandwidth': np.arange(0, 5, 0.5)}
            # kde = KernelDensity(kernel='epanechnikov')
            # kdeGrid = GridSearchCV(estimator=kde, param_grid=paramGrid, cv=3)
            # kde = kdeGrid.fit(x.reshape(-1, 1)).best_estimator_
            # logDens = kde.score_samples(x.reshape(-1, 1))
            # f = np.exp(logDens)

            p = dict()
            p["x"] = x
            p["f"] = f
            pdfs.append(p)
        probabilisticResults = {
            "pdfs": np.array(pdfs),
            "cdfs": np.array(cdfs)
        }
        self.dataset.validationP = probabilisticResults
        return probabilisticResults
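A standalone sketch of "Method 1" above (not the project's code): recover an approximate PDF from quantile-level CDF points by re-gridding the CDF and taking finite differences; scipy's interp1d stands in for the project's MathInterpolateUtils.interp1d helper:

# Hypothetical, self-contained illustration (not the project's code).
import numpy as np
from scipy.interpolate import interp1d
from scipy.stats import norm

F = np.linspace(0.001, 0.999, 999)        # quantile levels (CDF values)
x = norm.ppf(F)                           # stand-in for the predicted quantiles

xNew = np.linspace(x.min(), x.max(), len(x))
y = interp1d(x, F, kind='slinear')(xNew)  # CDF interpolated onto an even grid
f = np.zeros_like(xNew)
f[1:] = np.diff(y) / np.diff(xNew)        # method 1: rectangle-area finite difference
# f now approximates the standard normal density on xNew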
Example #7
class QuantileRegressionPOD(POD):
    """
    Quantile regression based POD.

    **Available constructor:**

    QuantileRegressionPOD(*inputSample, outputSample, detection, noiseThres,
    saturationThres, boxCox*)

    Parameters
    ----------
    inputSample : 2-d sequence of float
        Vector of the defect sizes, of dimension 1.
    outputSample : 2-d sequence of float
        Vector of the signals, of dimension 1.
    detection : float
        Detection value of the signal.
    noiseThres : float
        Value for low censored data. Default is None.
    saturationThres : float
        Value for high censored data. Default is None
    boxCox : bool or float
        Enable or not the Box Cox transformation. If boxCox is a float, the Box
        Cox transformation is enabled with the given value. Default is False.

    Notes
    -----
    This class aims at building the POD based on a quantile regression
    model. The returned POD model corresponds to an interpolation function built
    from the defect values computed for the given quantiles. The
    default is 21 quantile values from 0.05 to 0.98. They can be user-defined
    using the method *setQuantile*.

    The confidence level is computed by bootstrap. The POD model at the given
    confidence level is also an interpolation function based on the defect quantile
    values computed at the given confidence level.

    The computeDetectionSize method calls the real quantile regression
    at the given probability level.

    A progress bar is shown if the verbosity is enabled. It can be disabled using
    the method *setVerbose*.
    """

    def __init__(self, inputSample=None, outputSample=None, detection=None, noiseThres=None,
                 saturationThres=None, boxCox=False):

        self._quantile = np.linspace(0.05, 0.98, 21)
        self._verbose = True

        # initialize the POD class
        super(QuantileRegressionPOD, self).__init__(inputSample, outputSample,
                                 detection, noiseThres, saturationThres, boxCox)
        # inherited attributes
        # self._simulationSize
        # self._detection
        # self._inputSample
        # self._outputSample
        # self._noiseThres
        # self._saturationThres        
        # self._lambdaBoxCox
        # self._boxCox
        # self._size
        # self._dim
        # self._censored
        
        # assertion input dimension is 1
        assert (self._dim == 1), "Dimension of inputSample must be 1."

        if self._censored:
            logging.info('Censored data are not taken into account : the quantile ' + \
                         'regression model is only performed on filtered data.')


    def run(self):
        """
        Build the POD models.

        Notes
        -----
        This method builds the quantile regression model. First, the censored data
        are filtered if needed. The Box Cox transformation is performed if it is
        enabled. Then it builds the POD model for the given data and, using
        bootstrap, computes all the defect quantiles needed to build the POD model
        at the confidence level.
        """

        # Run the preliminary run of the POD class
        result = self._run(self._inputSample, self._outputSample, self._detection,
                           self._noiseThres, self._saturationThres, self._boxCox,
                           self._censored)

        # get some results
        self._defects = result['inputSample']
        self._signals = result['signals']
        self._detectionBoxCox = result['detectionBoxCox']

        defectsSize = self._defects.getSize()

        # create the quantile regression object
        X = ot.NumericalSample(defectsSize, [1, 0])
        X[:, 1] = self._defects
        self._algoQuantReg = QuantReg(np.array(self._signals), np.array(X))

        # Compute the defect quantile
        defectMax = self._defects.getMax()[0]
        defectList = []
        for probLevel in self._quantile:
            # fit the quantile regression and return the NMF
            model = self._buildModel(1. - probLevel)
            # Solve the model == detectionBoxCox with defects 
            # boundaries = [0, defectMax]
            defectList.append(ot.Brent().solve(model, self._detectionBoxCox,
                                               0, defectMax))
        # create support of the interpolating function including
        # point (0, 0) and point (defectMax, max(quantile))
        xvalue = np.hstack([0, defectList, defectMax])
        yvalue = np.hstack([0., self._quantile, self._quantile.max()])
        interpModel = interp1d(xvalue, yvalue, kind='linear')
        self._PODmodel = ot.PythonFunction(1, 1, interpModel)


        ############ Confidence interval with bootstrap ########################
        # Compute a NsimulationSize defect sizes for all quantiles
        data = ot.NumericalSample(self._size, 2)
        data[:, 0] = self._inputSample
        data[:, 1] = self._outputSample
        # bootstrap of the data
        bootstrapExp = ot.BootstrapExperiment(data)
        # create a numerical sample which contains for all simulations the 
        # defect quantile value. The goal is to compute the QuantilePerComponent
        # of the simulation for each defect quantile (columns)
        self._defectsPerQuantile = ot.NumericalSample(self._simulationSize, self._quantile.size)
        for i in range(self._simulationSize):
            # generate a sample with replacement within data of the same size
            bootstrapData = bootstrapExp.generate()
            # run the preliminary analysis: censoring check and Box Cox
            result = self._run(bootstrapData[:,0], bootstrapData[:,1], self._detection,
                               self._noiseThres, self._saturationThres,
                               self._boxCox, self._censored)

            # get some results
            defects = result['inputSample']
            signals = result['signals']
            detectionBoxCox = result['detectionBoxCox']
            defectsSize = defects.getSize()

            # new quantile regression algorithm
            X = ot.NumericalSample(defectsSize, [1, 0])
            X[:, 1] = defects
            algoQuantReg = QuantReg(np.array(signals), np.array(X))

            # compute the quantile defects
            defectMax = defects.getMax()[0]
            defectList = []
            for probLevel in self._quantile:
                fit = algoQuantReg.fit(1. - probLevel, max_iter=300, p_tol=1e-2)
                def model(x):
                    X = ot.NumericalPoint([1, x[0]])
                    return ot.NumericalPoint(fit.predict(X))
                model = ot.PythonFunction(1, 1, model)
                # Solve the model == detectionBoxCox with defects 
                # boundaries = [-infinity, defectMax] : it allows negative defects
                # when for small prob level, there is no intersection with
                # the detection threshold for positive defects
                defectList.append(ot.Brent().solve(model, detectionBoxCox,
                                                   -ot.SpecFunc.MaxNumericalScalar,
                                                   defectMax))
            # add the quantile in the numerical sample as the ith simulation
            self._defectsPerQuantile[i, :] = defectList
            if self._verbose:
                updateProgress(i, self._simulationSize, 'Computing defect quantile')

    def getPODModel(self):
        """
        Accessor to the POD model.

        Returns
        -------
        PODModel : :py:class:`openturns.NumericalMathFunction`
            The function which computes the probability of detection for a given
            defect value.
        """
        return self._PODmodel

    def getPODCLModel(self, confidenceLevel=0.95):
        """
        Accessor to the POD model at a given confidence level.

        Parameters
        ----------
        confidenceLevel : float
            The confidence level at which the POD must be computed. Default is 0.95.

        Returns
        -------
        PODModelCl : :py:class:`openturns.NumericalMathFunction`
            The function which computes the probability of detection for a given
            defect value at the confidence level given as parameter.
        """
        # Compute the quantile at the given confidence level for each
        # defect quantile and build the interpolate function.
        defectsQuantile = self._defectsPerQuantile.computeQuantilePerComponent(
                                                                confidenceLevel)

        xvalue = np.hstack([0, np.array(defectsQuantile), self._defects.getMax()[0]])
        yvalue = np.hstack([0., self._quantile, self._quantile.max()])
        interpModel = interp1d(xvalue, yvalue, kind='linear')
        PODmodelCl = ot.PythonFunction(1, 1, interpModel)

        return PODmodelCl

    def getR2(self, quantile):
        """
        Accessor to the pseudo R2 value.
        
        Parameters
        ----------
        quantile : float
            The quantile value for which the regression is performed.       

        Returns
        -------
        R2 : float
            The pseudo R2 value.
        """
        return self._algoQuantReg.fit(quantile).prsquared

    def getQuantile(self):
        """
        Accessor to the quantile list for the regression.
        """
        return self._quantile

    def setQuantile(self, quantile):
        """
        Accessor to the quantile list for the regression.

        Parameters
        ----------
        quantile : sequence of float
            The quantile values for which the regression is performed and the
            corresponding defect sizes are computed.
        """
        quantile = np.hstack(np.array(quantile))
        quantile.sort()
        if quantile.max() >= 1 or quantile.min() <= 0:
            raise ValueError('Quantile values must lie within ]0, 1[.')
        self._quantile = quantile

    @DocInherit # decorator to inherit the docstring from POD class
    @keepingArgs # decorator to keep the real signature
    def computeDetectionSize(self, probabilityLevel, confidenceLevel=None):
        defectMin = self._defects.getMin()[0]
        defectMax = self._defects.getMax()[0]
        # compute 'a90'
        model = self._buildModel(1. - probabilityLevel)
        try:
            detectionSize = ot.NumericalPointWithDescription(1, ot.Brent().solve(
                                        model, self._detectionBoxCox, defectMin, defectMax))
        except:
            raise Exception('The POD model does not contain, for the given ' + \
                             'defect interval, the wanted probability level.')
        description = ['a'+str(int(probabilityLevel*100))]

        # compute 'a90_95'
        if confidenceLevel is not None:
            modelCl = self.getPODCLModel(confidenceLevel)
            if not (modelCl([defectMin])[0] <= probabilityLevel <= modelCl([defectMax])[0]):
                raise Exception('The POD model at the confidence level does not '+\
                                'contain, for the given defect interval, the '+\
                                'wanted probability level.')
            detectionSize.add(ot.Brent().solve(modelCl,
                                               probabilityLevel,
                                               defectMin, defectMax))
            description.append('a'+str(int(probabilityLevel*100))+'/'\
                                                +str(int(confidenceLevel*100)))
        # add description to the NumericalPoint
        detectionSize.setDescription(description)
        return detectionSize

    @DocInherit # decorator to inherit the docstring from POD class
    @keepingArgs # decorator to keep the real signature
    def drawPOD(self, probabilityLevel=None, confidenceLevel=None, defectMin=None,
                defectMax=None, nbPt=100, name=None):

        if defectMin is None:
            defectMin = np.min(self._defects)
        else:
            if defectMin < np.min(self._defects):
                raise ValueError('DefectMin must be greater than the minimum ' + \
                                 'of the given defect sizes.')
            if defectMin > np.max(self._defects):
                raise ValueError('DefectMin must be lower than the maximum ' + \
                                 'of the given defect sizes.')
        if defectMax is None:
            defectMax = np.max(self._defects)
        else:
            if defectMax > np.max(self._defects):
                raise ValueError('DefectMax must be lower than the maximum ' + \
                                 'of the given defect sizes.')
            if defectMax < np.min(self._defects):
                raise ValueError('DefectMax must be greater than the minimum ' + \
                                 'of the given defect sizes.')

        if confidenceLevel is None:
            fig, ax = self._drawPOD(self.getPODModel(), None,
                                probabilityLevel, confidenceLevel, defectMin,
                                defectMax, nbPt, name)
        elif confidenceLevel is not None:
            fig, ax = self._drawPOD(self.getPODModel(), self.getPODCLModel(confidenceLevel),
                    probabilityLevel, confidenceLevel, defectMin,
                    defectMax, nbPt, name)

        ax.set_title('POD - Quantile regression model')
        if name is not None:
            fig.savefig(name, bbox_inches='tight', transparent=True)

        return fig, ax

    def drawLinearModel(self, probabilityLevel, name=None):
        """
        Draw the quantile regression prediction versus the true data.

        Parameters
        ----------
        probabilityLevel : float
            The probability level for which the quantile regression is performed
        name : string
            name of the figure to be saved with the *transparent* option set to
            True and *bbox_inches='tight'*. It can be either just the file name
            or the full path. Default is None.

        Returns
        -------
        fig : `matplotlib.figure <http://matplotlib.org/api/figure_api.html>`_
            Matplotlib figure object.
        ax : `matplotlib.axes <http://matplotlib.org/api/axes_api.html>`_
            Matplotlib axes object.
        """

        model = self._algoQuantReg.fit(1. - probabilityLevel)

        defects = self._defects
        signals = self._signals
        fittedSignals = model.fittedvalues

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(defects, signals, 'b.', label='Data', ms=9)
        ax.plot(defects, fittedSignals, 'r-', label='Linear regression model')
        ax.set_xlabel('Defects')
        ax.set_ylabel('Signals')
        ax.set_title('Quantile regression model at level (1 - ' + \
                                        str(probabilityLevel) + ')')
        ax.grid()
        ax.legend(loc='upper left')

        if name is not None:
            fig.savefig(name, bbox_inches='tight', transparent=True)

        return fig, ax

    def getVerbose(self):
        """
        Accessor to the verbosity.

        Returns
        -------
        verbose : bool
            Enable or disable the verbosity. Default is True. 
        """
        return self._verbose

    def setVerbose(self, verbose):
        """
        Accessor to the verbosity.

        Parameters
        ----------
        verbose : bool
            Enable or disable the verbosity.
        """
        if type(verbose) is not bool:
            raise TypeError('The parameter is not a bool.')
        else:
            self._verbose = verbose

    def _buildModel(self, probabilityLevel):
        """
        Build the NumericalMathFunction at the given probabilityLevel. It is
        used in run and in computeDetectionSize in order not to use the
        interpolation function.
        """
        fit = self._algoQuantReg.fit(probabilityLevel, max_iter=300, p_tol=1e-2)
        def model(x):
            X = ot.NumericalPoint([1, x[0]])
            return ot.NumericalPoint(fit.predict(X))
        return ot.PythonFunction(1, 1, model)
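A standalone sketch of the bootstrap confidence step used in run() (not the original class): resample the (defect, signal) pairs with replacement, refit the quantile regression, re-solve for the defect size, and take an empirical quantile of the bootstrap replicates; scipy's brentq again stands in for ot.Brent:

# Hypothetical, self-contained illustration (not the original class internals).
import numpy as np
from scipy.optimize import brentq
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(0)
defects = rng.uniform(0.1, 2.0, size=150)
signals = 0.2 + 1.5 * defects + rng.normal(0, 0.2, size=150)
detection, prob_level, n_boot = 1.0, 0.9, 200

def defect_at(prob, d, s):
    fit = QuantReg(s, np.column_stack([np.ones_like(d), d])).fit(q=1.0 - prob)
    return brentq(lambda a: fit.predict([[1.0, a]])[0] - detection, -10.0, d.max())

boot = []
for _ in range(n_boot):
    idx = rng.integers(0, defects.size, defects.size)        # resample with replacement
    boot.append(defect_at(prob_level, defects[idx], signals[idx]))

a90 = defect_at(prob_level, defects, signals)                # point estimate
a90_95 = np.quantile(boot, 0.95)                             # value at 95% confidence
print(a90, a90_95)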
Example #8
def train_predict_stacking_linear_regression(df_learning, df_prod,
                                             l_tuple_strategy_normalised):
    for quantile in constants.LIST_QUANTILE:
        to_keep = []
        for strategy, normalize_by in l_tuple_strategy_normalised:
            str_normalized = '_normed_by_' + normalize_by if normalize_by is not None else ''
            to_keep.append('{}{}_quantile_{:.3f}'.format(
                strategy, str_normalized, quantile))

        # Remove NA columns
        to_keep = df_learning[to_keep].notnull().all()
        to_keep = to_keep[to_keep].index.tolist()

        # We need to remove constant columns from the sampled data
        df_learning_weighted = df_learning.sample(10000,
                                                  weights='weight',
                                                  replace=True,
                                                  random_state=1)

        # Remove constant columns
        cols_constants = df_learning_weighted[to_keep].std() == 0
        cols_constants = cols_constants[cols_constants].index.tolist()
        for col in cols_constants:
            to_keep.remove(col)

        # # Remove correlated features
        # # Create correlation matrix
        # corr_matrix = df_learning[to_keep].corr().abs().fillna(1)

        # # Select upper triangle of correlation matrix
        # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        # # Find index of feature columns with correlation greater than 0.95
        # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
        # to_keep.remove(to_drop)

        # Drop duplicate columns
        def getDuplicateColumns(df):
            '''
            Get a list of duplicate columns.
            It iterates over all the columns in the dataframe and finds the columns whose contents are duplicated.
            :param df: Dataframe object
            :return: List of columns whose contents are duplicates.
            '''
            duplicateColumnNames = set()
            # Iterate over all the columns in dataframe
            for x in range(df.shape[1]):
                # Select column at xth index.
                col = df.iloc[:, x]
                # Iterate over all the columns in DataFrame from (x+1)th index till end
                for y in range(x + 1, df.shape[1]):
                    # Select column at yth index.
                    otherCol = df.iloc[:, y]
                    # Check if the columns at index x and y are equal
                    if col.equals(otherCol):
                        duplicateColumnNames.add(df.columns.values[y])

            return list(duplicateColumnNames)

        cols_duplicate = getDuplicateColumns(df_learning_weighted[to_keep])
        for cols in cols_duplicate:
            to_keep.remove(cols)

        # to_keep = df_learning_weighted[to_keep].T.drop_duplicates().T.columns  # Not efficient but ok

        X_learning_weighted = df_learning_weighted[to_keep].fillna(0)
        X_learning = df_learning[to_keep].fillna(0)
        X_prod = df_prod[to_keep].fillna(0)

        y_learning_weighted = df_learning_weighted['sales']
        # weight_learning = df_learning['weight']
        if X_learning_weighted.nunique().max() != 1:
            linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
            linear_model = linear_model.fit(q=quantile)
            # print(linear_model.summary())
            df_learning['quantile_{:.3f}'.format(
                quantile)] = linear_model.predict(X_learning)
            df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
                X_prod)
        else:
            df_learning['quantile_{:.3f}'.format(quantile)] = 0
            df_prod['quantile_{:.3f}'.format(quantile)] = 0

    return df_learning, df_prod
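A minimal standalone sketch of the stacking idea above (not the project's code): for each quantile, fit QuantReg of the target on the base models' predictions with no intercept, as in the snippet, and store one prediction column per quantile; the quantile list is a stand-in for constants.LIST_QUANTILE:

# Hypothetical, self-contained illustration of the per-quantile stacking fit.
import numpy as np
import pandas as pd
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(0)
X = pd.DataFrame({'model_a': rng.gamma(2.0, 5.0, 1000),
                  'model_b': rng.gamma(2.0, 5.0, 1000)})
sales = 0.6 * X['model_a'] + 0.4 * X['model_b'] + rng.normal(0, 2.0, 1000)

out = pd.DataFrame(index=X.index)
for quantile in (0.1, 0.5, 0.9):          # stand-in for constants.LIST_QUANTILE
    fit = QuantReg(sales, X).fit(q=quantile)
    out['quantile_{:.3f}'.format(quantile)] = fit.predict(X)
print(out.head())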
Example #9
    def fit(self, X, y=None):
        with warnings.catch_warnings():  # Deprecation warning disabled
            warnings.simplefilter("ignore")
            med_reg = QuantReg(y, X)
            self.coef_ = med_reg.fit(q=self.q).params
reg2 = HuberRegressor(epsilon = 1)

model2 = reg2.fit(x, y)
y_pred2 = model2.predict(x_test)

"""L1"""

dfx = pd.DataFrame(x, columns = ['x'])
dfy = pd.DataFrame(y, columns = ['y'])
exog = sm.add_constant(dfx['x'])
endog = dfy['y']
dft = pd.DataFrame(x_test, columns = ['test'])

qrmodel = QuantReg(endog, exog)
result = qrmodel.fit(q=0.5)

ypred_qr = np.dot(dft, result.params[1]) + result.params[0] #results.predict(dft)

"""Student-t"""

tmodel = TLinearModel(endog, exog)
results = tmodel.fit(df=0.6)

ypred_t = np.dot(dft, results.params[1]) + results.params[0] #results.predict(dft)

"""Plot"""

plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
plt.yticks(np.arange(ymin, ymax, 1.0))
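In the snippet above the predictions are assembled manually with np.dot; a small aside (an assumption, not in the original code): the commented-out results.predict(dft) route works once the test frame carries the same constant column as the training exog:

# Hypothetical aside: predict via the results object instead of np.dot,
# after adding the constant column to the test data.
exog_test = sm.add_constant(dft['test'], has_constant='add')
ypred_qr_alt = result.predict(exog_test.values)   # matches ypred_qr above (up to array shape)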
Example #11
def train_predict_lgb_point_to_uncertainity(df_learning, df_prod,
                                            verbose_eval):
    """
    Args :
    - df_learning
    - df_prod

    Returns:
    - df_valid with quantile prediction and pinball loss
    - df_prod with quantile prediction
    """
    (
        df_learning,
        df_train,
        df_valid,
        df_valid_oof,
        X_learning,
        X_train,
        X_valid,
        X_valid_oof,
        X_prod,
        y_learning,
        y_train,
        y_valid,
        y_valid_oof,
        weight_learning,
        weight_train,
        weight_valid,
        weight_valid_oof,
        lgb_learning,
        lgb_train,
        lgb_valid,
    ) = prepare_data(df_learning, df_prod)

    param, num_boost_round, early_stopping_rounds = get_lgb_params(
        objective='regression', dataset_nrows=df_learning.shape[0])
    col_predict = 'pred'

    df_learning_pred, df_valid_pred, df_valid_oof, df_prod = train_predict_lgb(
        df_learning, df_valid, X_learning, X_valid, df_valid_oof, df_prod,
        X_valid_oof, X_prod, lgb_train, lgb_valid, lgb_learning, param,
        num_boost_round, early_stopping_rounds, verbose_eval, col_predict)

    df_learning_weighted = pd.concat([df_valid_oof,
                                      df_valid_pred]).sample(100000,
                                                             weights='weight',
                                                             replace=True,
                                                             random_state=1)
    # If we fit QuantReg on overfitted predictions, QuantReg underestimates the safety margin needed
    # df_learning_weighted = df_learning.sample(100000, weights='weight', replace=True, random_state=1)

    to_keep = ['pred', 'horizon']
    X_learning_weighted = df_learning_weighted[to_keep]
    X_learning = df_learning[to_keep]
    X_valid_oof = df_valid_oof[to_keep]
    X_prod = df_prod[to_keep]
    # y_learning = df_learning['sales']
    y_learning_weighted = df_learning_weighted['sales']

    for quantile in constants.LIST_QUANTILE:
        # QuantReg does not have a weight parameter, so we manually reweight our datasets
        linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
        linear_model = linear_model.fit(q=quantile)
        # print(linear_model.summary())
        df_learning['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_learning)
        df_valid_oof['quantile_{:.3f}'.format(
            quantile)] = linear_model.predict(X_valid_oof)
        df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_prod)

    df_valid_oof = prep.compute_pinball(df_valid_oof)

    return df_valid_oof, df_prod
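prep.compute_pinball above is project code that is not shown; a generic sketch of the pinball (quantile) loss it presumably evaluates, under the usual definition:

# Hypothetical helper illustrating the pinball (quantile) loss for one quantile level.
import numpy as np

def pinball_loss(y_true, y_pred, quantile):
    diff = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.maximum(quantile * diff, (quantile - 1) * diff))

# e.g. pinball_loss(df_valid_oof['sales'], df_valid_oof['quantile_0.500'], 0.5)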
Example #12
def quantile_regression_epsilon(perfkerneldict, proposalkerneldict):
    """
    Quantile regression used to obtain the maximum epsilon (epsilon max).
    """
    target = abs(np.log(proposalkerneldict['target_probability']))
    # case of mala and rw
    if len(perfkerneldict['energy'].shape) == 1:
        energy = perfkerneldict['energy']
        energy_quant_reg = energy
    # case of hmc
    else:
        energy = -perfkerneldict['energy'][:, 1:] + perfkerneldict[
            'energy'][:, :1]
        energy_quant_reg = energy[:, -1]
    epsilon = perfkerneldict['epsilon'].flatten()
    #import ipdb; ipdb.set_trace()
    if np.isnan(energy_quant_reg).any() or np.isinf(energy_quant_reg).any():
        #import ipdb; ipdb.set_trace()
        selector = np.isfinite(energy_quant_reg)
        energy_quant_reg = energy_quant_reg[selector]
        epsilon = epsilon[selector]
        print('discard nan in energy')
    try:
        max_selector = abs(energy_quant_reg) < abs(
            np.log(proposalkerneldict['target_probability']))
        epsilon_max_simple = max(epsilon[max_selector])
    except:
        try:
            epsilon_max_simple = max(epsilon[np.argmax(energy_quant_reg)])
        except:
            epsilon_max_simple = max(epsilon)
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")

            energy_quant_reg_clipped = np.clip(abs(energy_quant_reg), 0, 10**6)
            quant_reg = QuantReg(energy_quant_reg_clipped, epsilon**2)
            res_median = quant_reg.fit()
            res_lower = quant_reg.fit(0.5)
            #res_upper = quant_reg.fit(0.75)
            epsilon_max_quant = (target / res_lower.params)**0.5
            epsilon_next = (target / res_median.params)**0.5
    except:
        import ipdb
        ipdb.set_trace()

    #import ipdb; ipdb.set_trace()

    #epsilon_min = (target/res_upper.params)**0.5
    epsilon_max = np.max([epsilon_max_quant, epsilon_max_simple])
    if np.isinf(epsilon_next):
        epsilon_next = np.mean(epsilon)
        #import ipdb; ipdb.set_trace()

    if False:
        #import ipdb; ipdb.set_trace()
        from matplotlib import pyplot as plt
        import seaborn as sns
        plt.rc('font', size=20)
        sns.set_style("whitegrid")

        plt.scatter(y=energy_quant_reg, x=epsilon, color='blue')
        plt.xlabel('epsilon', fontsize=14)
        plt.ylabel('Variation energy', fontsize=14)
        #plt.plot(epsilon, res_median.params*(epsilon**2).flatten(), color='red')
        #plt.plot(epsilon, res_lower.params*(epsilon**2).flatten(), color='grey')
        #plt.scatter(y=res_lower.params*(epsilon_current**2).flatten(), x = (epsilon_current).flatten(), color='grey')

        #plt.title('Variation in energy according to epsilon')
        plt.savefig('energy_temp_%s.pdf' % (perfkerneldict['temp']))
        #plt.tight_layout(pad=1.2)
        plt.clf()

    #import matplotlib.pyplot as plt
    #import seaborn as sns
    #import ipdb; ipdb.set_trace()
    #plt.scatter(y=energy_quant_reg, x=perfkerneldict['L'])
    return epsilon_next, epsilon_max
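A standalone sketch of the core step above (not the original sampler code): median-regress the clipped |energy change| on epsilon**2 through the origin, then invert for the step size whose predicted energy change matches the target:

# Hypothetical, self-contained illustration (the original works on sampler output).
import numpy as np
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(0)
epsilon = rng.uniform(0.01, 0.5, size=500)
energy = 3.0 * epsilon**2 + rng.normal(0, 0.05, size=500)    # |Delta E| grows like eps**2

target = abs(np.log(0.9))                                    # e.g. target acceptance probability 0.9
exog = (epsilon**2)[:, None]                                 # no intercept: regression through the origin
slope = QuantReg(np.clip(np.abs(energy), 0, 10**6), exog).fit(q=0.5).params[0]
epsilon_next = (target / slope) ** 0.5
print(epsilon_next)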
Example #13
scaled = pd.DataFrame(StandardScaler().fit_transform(orig.copy().values),
                      columns=orig.columns)
print(orig.shape)
assert orig.shape == scaled.shape


# In[5]:

model = QuantReg(response, orig)


# In[6]:

for q in np.linspace(0.05, 0.95, 10):
    print(q)
    print(model.fit(q=q).summary())
    print()
    print()
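A small companion sketch (an aside, not in the original notebook): collect the fitted coefficients across quantiles into one table instead of printing full summaries:

# Hypothetical aside: coefficients per quantile in one DataFrame.
import numpy as np
import pandas as pd

coef_by_q = pd.DataFrame({q: model.fit(q=q).params
                          for q in np.linspace(0.05, 0.95, 10)}).T
print(coef_by_q)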


Example #14
# Period for which the kernel density is to be estimated
data_est = data.loc[est_sdate:est_edate, x_name]

##################
### Quantile regression

#model
model = QuantReg(data_y, data_x)

# Step size (quantile increment) for the quantile regression
step = 0.01
n = int(1 / step) - 1

# Coefficient matrix
coeff = np.ones((n, len(data_x.T)))
for i in range(n):
    res = model.fit(q=step * (i + 1))
    coeff[i, :] = np.array(list(res.params)).reshape(1, -1)

#########################
### Kernel density estimation

# Build the pseudo inverse cumulative distribution function
est_values = np.dot(coeff, np.array(data_est).T)

for i in range(len(est_values.T)):
    delta = relativedelta(months=i)
    sns.kdeplot(est_values[:, i], kernel="epa", label=est_sdate + delta)
plt.show()
plt.close()
Example #15
def update_revenue_forecast(historicals, method, fcast_index, focus_scenario,
                            p, d, q, P, D, Q):
    historicals = pd.DataFrame(historicals)
    historicals['DT_FIM_EXERC'] = pd.to_datetime(historicals['DT_FIM_EXERC'])
    models = {}

    # Revenue time series model
    data = historicals.set_index('DT_FIM_EXERC').asfreq('Q')
    y = data['Revenue']
    # Transform
    if fcast_index != '':
        idx = data[fcast_index.upper()]
        y = y / idx * idx.iloc[-1]
    y = np.log(y)

    # Create forecast model
    if method == 'ets':
        rev_model = ExponentialSmoothing(y,
                                         trend=True,
                                         damped_trend=True,
                                         seasonal=4)
    elif method == 'arima':
        rev_model = SARIMAX(y,
                            order=(p, d, q),
                            seasonal_order=(P, D, Q, 4),
                            trend='c')
    else:
        return {}
    rev_results = rev_model.fit()
    models['revenue'] = {
        'Params': rev_results.params,
        'diag': {
            'In-sample RMSE': np.sqrt(rev_results.mse),
            'In-sample MAE': rev_results.mae,
            'Ljung-Box': rev_results.test_serial_correlation('ljungbox')[0, 0,
                                                                         -1],
            'log-Likelihood': rev_results.llf,
            'AICc': rev_results.aicc,
            'BIC': rev_results.bic
        }
    }
    # Cross validation
    foldsize = 1
    nfolds = round(y.shape[0] / (4 * foldsize)) - 1
    cv_errors = []
    for fold in range(nfolds, 0, -1):
        train_subset = y.iloc[:-(fold + 2) * (4 * foldsize)]
        valid_subset = y.iloc[-(fold + 2) * (4 * foldsize):-(fold + 1) *
                              (4 * foldsize)]
        if train_subset.shape[0] < 16:
            continue
        fcasts = (rev_model.clone(np.log(train_subset)).fit().forecast(
            valid_subset.shape[0]))
        cv_errors = np.append(cv_errors, fcasts - np.log(valid_subset))
    if len(cv_errors) > 4:
        models['revenue']['diag']['CV RMSE'] = np.sqrt(
            np.mean(np.array(cv_errors)**2))
        models['revenue']['diag']['CV MAE'] = np.mean(np.abs(cv_errors))

    # Generate simulated forecasts
    nsim = 100
    horiz = int(np.sum(focus['scenario'] == focus_scenario))
    forecasts = (pd.DataFrame({
        'y': rev_results.forecast(horiz),
        'group': 'forecast',
        'variable_1': ''
    }).reset_index())
    simulations = (rev_results.simulate(
        horiz, repetitions=nsim,
        anchor=data.shape[0]).reset_index().melt('index', value_name='y').drop(
            columns='variable_0').assign(group='simulation'))

    simulations = (pd.concat(
        [simulations,
         forecasts]).reset_index(drop=True).rename(columns={
             'variable_1': 'iteration',
             'index': 'DT_FIM_EXERC'
         }).pipe(add_quarters))
    simulations['Revenue'] = np.exp(simulations['y'])
    if fcast_index != '':
        simulations = simulations.merge(
            focus[['DT_FIM_EXERC',
                   fcast_index.upper()]][focus['scenario'] == focus_scenario],
            on="DT_FIM_EXERC",
            how="left")
        simulations['Revenue'] = simulations['Revenue'] \
            * simulations[fcast_index.upper()] \
            / data[fcast_index.upper()].iloc[-1]

    simulations['RevenueGrowth'] = 100 * (
        simulations['Revenue'] /
        simulations.groupby('iteration')['Revenue'].shift(4) - 1)
    simulations.loc[simulations['RevenueGrowth'].isna(), 'RevenueGrowth'] = \
        np.reshape(
            100 * (
                np.reshape(
                    simulations['Revenue'][simulations['RevenueGrowth'].isna()].values,
                    (nsim + 1, 4)) /
                historicals['Revenue'].tail(4).values - 1
                ),
            ((nsim + 1) * 4)
        )

    # Expenses regression model
    historicals['logRevenue'] = np.log(historicals['Revenue'])
    exog = historicals[['logRevenue', 'Q1', 'Q2', 'Q3', 'Q4']]

    opex_model = QuantReg(np.log(historicals['Opex']), exog)
    opex_results = opex_model.fit(q=0.5)
    opex_coefs = opex_results.params
    rmse = np.mean(opex_results.resid**2)**.5

    models['opex'] = {
        'Params': opex_results.params,
        'diag': {
            'In-sample RMSE': np.sqrt(np.mean(opex_results.resid**2)),
            'In-sample MAE': np.mean(np.abs(opex_results.resid)),
            #'Ljung-Box': opex_results.test_serial_correlation('ljungbox')[0, 0, -1],
            #'log-Likelihood': opex_results.llf,
            #'AICc': opex_results.aicc,
            #'BIC': opex_results.bic
        }
    }

    # Simulations
    simulations['Opex'] = np.exp(
        opex_coefs[0] * np.log(simulations['Revenue']) +
        opex_coefs[1] * simulations['Q1'] + opex_coefs[2] * simulations['Q2'] +
        opex_coefs[3] * simulations['Q3'] + opex_coefs[4] * simulations['Q4'] +
        np.random.normal(0, rmse, simulations.shape[0]) *
        (simulations['group'] == 'simulation'))
    simulations['EBIT'] = simulations['Revenue'] - simulations['Opex']
    simulations[
        'EBITMargin'] = 100 * simulations['EBIT'] / simulations['Revenue']
    simulations['Taxes'] = simulations['EBIT'] * .34
    simulations['NOPAT'] = simulations['EBIT'] - simulations['Taxes']

    simulations = pd.concat(
        [historicals.assign(group='historicals', iteration=''), simulations])

    return simulations.to_dict('records'), models
Example #16
def train_predict_lgb_tweedie(df_learning, df_prod, verbose_eval=75):
    """
    Args :
    - df_learning
    - df_prod

    Returns:
    - df_valid with quantile prediction and pinball loss
    - df_prod with quantile prediction
    """
    (
        df_learning,
        df_train,
        df_valid,
        df_valid_oof,
        X_learning,
        X_train,
        X_valid,
        X_valid_oof,
        X_prod,
        y_learning,
        y_train,
        y_valid,
        y_valid_oof,
        weight_learning,
        weight_train,
        weight_valid,
        weight_valid_oof,
        lgb_learning,
        lgb_train,
        lgb_valid,
    ) = prepare_data(df_learning, df_prod)

    param, num_boost_round, early_stopping_rounds = get_lgb_params(
        objective='tweedie', dataset_nrows=df_learning.shape[0])
    col_predict = 'pred'

    df_learning_pred, df_valid_pred, df_valid_oof, df_prod = train_predict_lgb(
        df_learning, df_valid, X_learning, X_valid, df_valid_oof, df_prod,
        X_valid_oof, X_prod, lgb_train, lgb_valid, lgb_learning, param,
        num_boost_round, early_stopping_rounds, verbose_eval, col_predict)

    from statsmodels.regression.quantile_regression import QuantReg

    df_learning_weighted = df_learning.sample(100000,
                                              weights='weight',
                                              replace=True)

    to_keep = ['pred', 'horizon']
    X_learning_weighted = df_learning_weighted[to_keep]
    X_learning = df_learning[to_keep]
    X_valid_oof = df_valid_oof[to_keep]
    X_prod = df_prod[to_keep]
    # y_learning = df_learning['sales']
    y_learning_weighted = df_learning_weighted['sales']

    for quantile in constants.LIST_QUANTILE:
        # QuantReg does not have a weight parameter, so we manually reweight our datasets
        linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
        linear_model = linear_model.fit(q=quantile)
        # print(linear_model.summary())
        df_learning['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_learning)
        df_valid_oof['quantile_{:.3f}'.format(
            quantile)] = linear_model.predict(X_valid_oof)
        df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_prod)

    df_valid_oof = prep.compute_pinball(df_valid_oof)

    return df_valid_oof, df_prod
Example #17
    def fit(self,X,*args,**kwargs):
        
        """
        Fit a projection pursuit dimension reduction model. 
        
        Required input argument: X data as matrix or data frame 
        
        Optional input arguments: 
            
            arg or kwarg:
            y data as vector or 1D matrix
            
            kwargs: 
            h, int: option to overrule class's n_components parameter in fit. 
                Convenient at the command line, yet should not be used in automated 
                loops, e.g. cross-validation.
                
            dmetric, str: distance metric used internally. Defaults to 'euclidean'
            
            mixing, bool: to estimate mixing matrix (only relevant for ICA)
            
            Further parameters to the regression methods can be passed on 
            here as well as kwargs, e.g. quantile=0.8 for quantile regression. 
            
            kwargs only relevant if y specified: 
        
        """

        # Collect optional fit arguments
        biascorr = kwargs.pop('biascorr',False)
            
        if 'h' not in kwargs:
            h = self.n_components
        else:
            h = kwargs.pop('h')
            self.n_components = h
            
        if 'dmetric' not in kwargs:
            dmetric = 'euclidean'
        else:
            dmetric = kwargs.get('dmetric')
            
        if 'mixing' not in kwargs:
            mixing = False
        else:
            mixing = kwargs.get('mixing')
            
        if 'y' not in kwargs:
            na = len(args)
            if na > 0: #Use of *args makes it sklearn consistent
                flag = 'two-block'
                y = args[0]
            else:
                flag = 'one-block'
                y = 0 # to allow calls with 'y=y' in spite of no real y argument present
        else:
            flag = 'two-block'
            y = kwargs.get('y')
                            
            if 'quantile' not in kwargs:
                quantile = .5
            else:
                quantile = kwargs.get('quantile')
                
            if self.regopt == 'robust':
            
                if 'fun' not in kwargs:
                    fun = 'Hampel'
                else:
                    fun = kwargs.get('fun')
                
                if 'probp1' not in kwargs:
                    probp1 = 0.95
                else:
                    probp1 = kwargs.get('probp1')
                
                if 'probp2' not in kwargs:
                    probp2 = 0.975
                else:
                    probp2 = kwargs.get('probp2')
                
                if 'probp3' not in kwargs:
                    probp3 = 0.99
                else:
                    probp3 = kwargs.get('probp3')

            
        if self.projection_index == dicomo:
            
            if self.pi_arguments['mode'] in ('M3','cos','c*k'):
            
                if 'option' not in kwargs:
                    option = 1
                else:
                    option = kwargs.get('option')
                
                if option > 3:
                    print('Option value >3 will compute results, but meaning may be questionable')
                
        # Initiate projection index    
        self.most = self.projection_index(**self.pi_arguments)         
        
        # Initiate some parameters and data frames
        if self.copy:
            X0 = copy.deepcopy(X)
            self.X0 = X0
        else:
            X0 = X        
        X = convert_X_input(X0)    
        n,p = X0.shape 
        trimming = self.trimming
        
        # Check dimensions 
        if h > min(n,p):
            raise(MyException('number of components cannot exceed number of samples'))
            
        if (self.projection_index == dicomo and self.pi_arguments['mode'] == 'kurt' and self.whiten_data==False):
            warnings.warn('Whitening step is recommended for ICA')
            
        # Pre-processing adjustment if whitening
        if self.whiten_data:
            self.center_data = True
            self.scale_data = False
            self.compression = False
            print('All results produced are for whitened data')
        
        # Centring and scaling
        if self.scale_data:
            if self.center=='mean':
                scale = 'std'
            elif ((self.center=='median')|(self.center=='l1median')):
                scale = 'mad' 
        else:
            scale = 'None'
            warnings.warn('Without scaling, convergence to optima is not guaranteed')
            
         # Data Compression for flat tables if required                
        if ((p>n) and self.compression):
            V,S,U = np.linalg.svd(X.T,full_matrices=False)
            X = np.matmul(U.T,np.diag(S))
            n,p = X.shape
            
            if (srs.mad(X)==0).any(): 
                warnings.warn('Due to low scales in data, compression would induce zero scales.' 
                              + '\n' + 'Proceeding without compression.')
                dimensions = False
                if copy:
                    X = copy.deepcopy(X0)
                else:
                    X = X0
            else:
                dimensions = True
        else:
            dimensions = False
        
        # Initiate centring object and scale X data 
        centring = VersatileScaler(center=self.center,scale=scale,trimming=trimming)      
  
        if self.center_data:
            Xs = centring.fit_transform(X)
            mX = centring.col_loc_
            sX = centring.col_sca_
        else:
            Xs = X
            mX = np.zeros((1,p))
            sX = np.ones((1,p))

        fit_arguments = {}
            
        # Data whitening (best practice for ICA)
        if self.whiten_data:
            V,S,U = np.linalg.svd(Xs.T,full_matrices=False)
            del U
            K = (V/S)[:,:p]
            del V,S
            Xs = np.matmul(Xs, K)
            Xs *= np.sqrt(p)
        
        # Presently, X and y need to be matrices 
        # Will be changed to use regular np.ndarray
        Xs = np.matrix(Xs)

        # Pre-process y data when available 
        if flag != 'one-block':
            
            ny = y.shape[0]
            y = convert_y_input(y)
            if len(y.shape) < 2:
                y = np.matrix(y).reshape((ny,1))
            # py = y.shape[1]
            if ny != n:
                raise(MyException('X and y number of rows must agree'))
            if self.copy:
                y0 = copy.deepcopy(y)
                self.y0 = y0
                
            if self.center_data:
                ys = centring.fit_transform(y)
                my = centring.col_loc_
                sy = centring.col_sca_ 
            else:
                ys = y
                my = 0
                sy = 1
            ys = np.matrix(ys).astype('float64')
        
        else:
            ys = None
                

        # Initializing output matrices
        W = np.zeros((p,h))
        T = np.zeros((n,h))
        P = np.zeros((p,h))
        B = np.zeros((p,h))
        R = np.zeros((p,h))
        B_scaled = np.zeros((p,h))
        C = np.zeros((h,1))
        Xev = np.zeros((h,1))
        assovec = np.zeros((h,1))
        Maxobjf = np.zeros((h,1))

        # Initialize deflation matrices 
        E = copy.deepcopy(Xs)
        f = ys

        bi = np.zeros((p,1))
        
        opt_args = { 
                    'alpha': self.alpha,
                    'trimming': self.trimming,
                    'biascorr': biascorr, 
                    'dmetric' : 'euclidean',
                    }
        
        if self.optimizer=='grid':
            # Define grid optimization ranges
            if 'ndir' not in self.optimizer_options:
                self.optimizer_options['ndir'] = 1000
            optrange = np.sign(self.optrange)
            optmax = self.optrange[1]
            stop0s = np.arcsin(optrange[0])
            stop1s = np.arcsin(optrange[1])
            stop1c = np.arccos(optrange[0])
            stop0c = np.arccos(optrange[1])
            anglestart = max(stop0c,stop0s)
            anglestop = max(stop1c,stop1s)
            nangle = np.linspace(anglestart,anglestop,self.optimizer_options['ndir'],endpoint=False)            
            alphamat = np.matrix([np.cos(nangle), np.sin(nangle)])
            opt_args['_stop0c'] = stop0c
            opt_args['_stop0s'] = stop0s
            opt_args['_stop1c'] = stop1c
            opt_args['_stop1s'] = stop1s
            opt_args['optmax'] = optmax
            opt_args['optrange'] = self.optrange
            opt_args['square_pi'] = self.square_pi
            if optmax != 1:
                alphamat *= optmax
        
            if p>2:
                anglestart = min(opt_args['_stop0c'],opt_args['_stop0s'])
                anglestop = min(opt_args['_stop1c'],opt_args['_stop1s'])
                nangle = np.linspace(anglestart,anglestop,self.optimizer_options['ndir'],endpoint=True)
                alphamat2 = np.matrix([np.cos(nangle), np.sin(nangle)])
                if optmax != 1:
                    alphamat2 *= opt_args['optmax']
                
            # Arguments for grid plane
            opt_args['alphamat'] = alphamat
            opt_args['ndir'] = self.optimizer_options['ndir']
            opt_args['maxiter'] = self.optimizer_options['maxiter']
            if isinstance(opt_args['ndir'], tuple):
                opt_args['ndir'] = opt_args['ndir'][0]
            
            # Arguments for grid plane #2
            grid_args_2 = { 
                     'alpha': self.alpha,
                     'alphamat': alphamat2,
                     'ndir': self.optimizer_options['ndir'],
                     'trimming': self.trimming,
                     'biascorr': biascorr, 
                     'dmetric' : 'euclidean',
                     '_stop0c' : stop0c,
                     '_stop0s' : stop0s,
                     '_stop1c' : stop1c,
                     '_stop1s' : stop1s,
                     'optmax' : optmax,
                     'optrange' : self.optrange,
                     'square_pi' : self.square_pi
                     }
            if flag=='two-block':
                grid_args_2['y'] = f
        
        if flag=='two-block':
            opt_args['y'] = f
            

        # Iterative coefficient estimation
        for i in range(0,h):

            if self.optimizer=='grid':
                if p==2:
                    wi,maximo = gridplane(E,self.most,
                                          pi_arguments=opt_args
                                          )
           
                elif p>2:
                
                    afin = np.zeros((p,1)) # final parameters for linear combinations
                    Z = copy.deepcopy(E)
                    # sort variables according to criterion
                    meas = [self.most.fit(E[:,k],
                            **opt_args) 
                            for k in np.arange(0,p)]
                    if self.square_pi:
                        meas = np.square(meas)
                    wi,maximo = gridplane(Z[:,0:2],self.most,opt_args)
                    Zopt = Z[:,0:2]*wi 
                    afin[0:2]=wi
                    for j in np.arange(2,p):
                        projmat = np.matrix([np.array(Zopt[:,0]).reshape(-1),
                                         np.array(Z[:,j]).reshape(-1)]).T
                        wi,maximo = gridplane(projmat,self.most,
                                              opt_args
                                              )
                        Zopt = Zopt*float(wi[0]) + Z[:,j]*float(wi[1])
                        afin[0:(j+1)] = afin[0:(j+1)]*float(wi[0])
                        afin[j] = float(wi[1])

                    tj = Z*afin
                    objf = self.most.fit(tj,
                                     **{**fit_arguments,**opt_args}
                                    )
                    if self.square_pi:
                        objf *= objf
    

                    # outer loop to run until convergence
                    objfold = copy.deepcopy(objf)
                    objf = -1000
                    afinbest = afin
                    ii = 0
                    maxiter_2j = 2**round(np.log2(self.optimizer_options['maxiter'])) 
                
                    while ((ii < self.optimizer_options['maxiter'] + 1) and (abs(objfold - objf)/abs(objf) > 1e-4)):
                        for j in np.arange(0,p):
                            projmat = np.matrix([np.array(Zopt[:,0]).reshape(-1),
                                         np.array(Z[:,j]).reshape(-1)]).T
                            if j > 16:
                                divv = maxiter_2j
                            else:
                                divv = min(2**j,maxiter_2j)
                        
                            wi,maximo = gridplane_2(projmat,
                                                    self.most,
                                                    q=afin[j],
                                                    div=divv,
                                                    pi_arguments=grid_args_2
                                                    )
                            Zopt = Zopt*float(wi[0,0]) + Z[:,j]*float(wi[1,0])
                            afin *= float(wi[0,0])
                            afin[j] += float(wi[1,0])
                        
                        # Evaluate the objective function
                        tj = Z*afin
                    
                        objfold = copy.deepcopy(objf)
                        objf = self.most.fit(tj,
                                         q=afin,
                                         **opt_args
                                         )
                        if self.square_pi:
                            objf *= objf
                    
                        if  objf!=objfold:
                            if self.constraint == 'norm':
                                afinbest = afin/np.sqrt(np.sum(np.square(afin)))
                            else:
                                afinbest = afin
                            
                        ii +=1
                        if self.verbose:
                            print(str(ii))
                    #endwhile
                
                    afinbest = afin
                    wi = np.zeros((p,1))
                    wi = afinbest
                    Maxobjf[i] = objf
                # end if p > 2
            else: # do not optimize by the grid algorithm
                if self.trimming > 0: 
                    warnings.warn('Optimization involving a trimmed objective is not a quadratic program; the scipy.optimize result may be inaccurate.')
                if 'center' in self.pi_arguments:
                    if (self.pi_arguments['center']=='median'): 
                        warnings.warn('Optimization involving a median in the objective is not a quadratic program; the scipy.optimize result may be inaccurate.')
                constraint = {'type':'eq',
                              'fun': lambda x: np.linalg.norm(x) -1,
                              }
                if len(self.optimizer_constraints)>0: 
                    constraint = [constraint,self.optimizer_constraints]
                wi = minimize(pp_objective,
                              E[0,:].transpose(),
                              args=(self.most,E,opt_args),
                              method=self.optimizer,
                              constraints=constraint,
                              options=self.optimizer_options).x
                wi = np.matrix(wi).reshape((p,1))
                wi /= np.sqrt(np.sum(np.square(wi)))
                
                
            # Computing projection weights and scores
            ti = E*wi
            if self.optimizer != 'grid':
                Maxobjf[i] = self.most.fit(E*wi,**opt_args)
            nti = np.linalg.norm(ti)
            pi = E.T*ti / (nti**2)
            if self.whiten_data:
                wi /= np.sqrt((wi**2).sum())
                wi = K*wi
            wi0 = wi
            wi = np.array(wi)
            if len(W[:,i].shape) == 1:
                wi = wi.reshape(-1)
            W[:,i] = wi
            T[:,i] = np.array(ti).reshape(-1)
            P[:,i] = np.array(pi).reshape(-1)
            
            if flag != 'one-block':
                criteval = self.most.fit(E*wi0,
                                         **opt_args
                                         )
                if self.square_pi:
                    criteval *= criteval
                    
                assovec[i] = criteval
                

            # Deflation of the data matrix, guaranteeing orthogonality restrictions
            E -= ti*pi.T
 
            # Calculate R-Weights
            R = np.dot(W[:,0:(i+1)],pinv2(np.dot(P[:,0:(i+1)].T,W[:,0:(i+1)]),check_finite=False))
        
            # Execute regression y~T if y is present. Generate regression estimates.
            if flag != 'one-block':
                if self.regopt=='OLS':
                    ci = np.dot(ti.T,ys)/(nti**2)
                elif self.regopt == 'robust':
                    linfit = rm(fun=fun,probp1=probp1,probp2=probp2,probp3=probp3,
                                centre=self.center,scale=scale,
                                start_cutoff_mode='specific',verbose=self.verbose)
                    linfit.fit(ti,ys)
                    ci = linfit.coef_
                elif self.regopt == 'quantile':
                    linfit = QuantReg(y,ti)
                    model = linfit.fit(q=quantile)
                    ci = model.params
                # end regression if
                
                C[i] = ci
                bi = np.dot(R,C[0:(i+1)])
                bi_scaled = bi
                bi = np.multiply(np.reshape(sy/sX,(p,1)),bi)
                B[:,i] = bi[:,0]
                B_scaled[:,i] = bi_scaled[:,0]

        # endfor; Loop for latent dimensions

        # Re-adjust estimates to original dimensions if data have been compressed 
        if dimensions:
            B = np.matmul(V[:,0:p],B)
            B_scaled = np.matmul(V[:,0:p],B_scaled)
            R = np.matmul(V[:,0:p],R)
            W = np.matmul(V[:,0:p],W)
            P = np.matmul(V[:,0:p],P)
            bi = B[:,h-1]
            if self.center_data:
                Xs = centring.fit_transform(X0)
                mX = centring.col_loc_
                sX = centring.col_sca_
            else:
                Xs = X0
                mX = np.zeros((1,p))
                sX = np.ones((1,p))
        
        bi = bi.astype("float64")
        if flag != 'one-block':            
            # Calculate scaled and unscaled intercepts
            if dimensions:
                X = convert_X_input(X0)
            if(self.center == "mean"):
                intercept = sps.trim_mean(y - np.matmul(X,bi),trimming)
            else:
                intercept = np.median(np.reshape(y - np.matmul(X,bi),(-1)))
            yfit = np.matmul(X,bi) + intercept
            if not(scale == 'None'):
                if (self.center == "mean"):
                    b0 = np.mean(ys - np.matmul(Xs.astype("float64"),bi))
                else:
                    b0 = np.median(np.array(ys.astype("float64") - np.matmul(Xs.astype("float64"),bi)))
            else:
                b0 = intercept
            
            # Calculate fitted values and residuals
            r = y - yfit
            setattr(self,"coef_",B)
            setattr(self,"intercept_",intercept)
            setattr(self,"coef_scaled_",B_scaled)
            setattr(self,"intercept_scaled_",b0)
            setattr(self,"residuals_",r)
            setattr(self,"fitted_",yfit)
            setattr(self,"y_loadings_",C)
            setattr(self,"y_loc_",my)
            setattr(self,"y_sca_",sy)
                
        setattr(self,"x_weights_",W)
        setattr(self,"x_loadings_",P)
        setattr(self,"x_rotations_",R)
        setattr(self,"x_scores_",T)
        setattr(self,"x_ev_",Xev)
        setattr(self,"crit_values_",assovec)
        setattr(self,"Maxobjf_",Maxobjf)
        
        if self.whiten_data:
            setattr(self,"whitening_",K)

        
        if mixing:
            setattr(self,"mixing_",np.linalg.pinv(W))
        
        
        setattr(self,"x_loc_",mX)
        setattr(self,"x_sca_",sX)

        setattr(self,'scaling',scale)
        if self.return_scaling_object:
            setattr(self,'scaling_object_',centring)
        
        return(self)   
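For reference, a compact numpy sketch of the deflation and coefficient-reconstruction algebra performed by the loop above (synthetic data; the direction-finding step is replaced here by a plain SVD instead of the robust projection index, so this only illustrates the linear algebra, not the estimator itself):

import numpy as np

rng = np.random.default_rng(0)
n, p, h = 100, 5, 2
X = rng.normal(size=(n, p))
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 0.0]) + 0.1 * rng.normal(size=n)

E = X.copy()
W, T, P = np.zeros((p, h)), np.zeros((n, h)), np.zeros((p, h))
C = np.zeros(h)
for i in range(h):
    # direction: dominant right singular vector of E (stand-in for the projection index)
    w = np.linalg.svd(E, full_matrices=False)[2][0]
    t = E @ w                          # scores t_i = E w_i
    pi_ = E.T @ t / (t @ t)            # loadings p_i = E^T t_i / ||t_i||^2
    C[i] = (t @ y) / (t @ t)           # OLS regression of y on the score (regopt='OLS')
    W[:, i], T[:, i], P[:, i] = w, t, pi_
    E = E - np.outer(t, pi_)           # deflation, as in E -= ti*pi.T above

R = W @ np.linalg.pinv(P.T @ W)        # R-weights, as in the pinv2 call above
B = R @ C                              # regression coefficients in the original X space
print(np.round(B, 3))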
Example #18
0
def QuantileRegression(X, Y, quantile):
    mod = QuantReg(Y, X)
    res = mod.fit(q=quantile)
    return res.params
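A small usage sketch for this helper (synthetic data; the caller is assumed to add the intercept column with sm.add_constant, as the other examples do):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
x = rng.uniform(size=200)
y = 2.0 + 3.0 * x + rng.standard_normal(200)

X = sm.add_constant(x)                        # intercept column + x
params = QuantileRegression(X, y, quantile=0.5)
print(params)                                 # roughly [2., 3.]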
Example #19
0
File: knee.py Project: josepm/capacity
def find_knee(X, Y, q=0.75, conf_level=0.999, q_init=0.5, n_knees=1):
    """
    Finds the knee of the XY curve (i.e. where Y shoots up in a "non-linear" fashion with respect to X)
    Assumes that Y is noisily increasing with X.
    The choice of q_init, q and conf_level reflects the subjectivity of the problem.
    - larger q_init will detect knees 'later' (i.e. for higher values of X or miss them altogether)
    - larger conf_level will detect knees 'later'
    - larger q will detect knees 'earlier'
    Example (M/M/1):
    X = np.random.uniform(low=0, high=1, size=100)
    Y = np.maximum(0, 1.0 / (1-X) + np.random.normal(0, 1, size=100))
    plt.scatter(X, Y)
    find_knee(X, Y, q=0.5, conf_level=0.999, q_init = 0.5)
    find_knee(X, Y, q=0.25, conf_level=0.999, q_init = 0.5)
    find_knee(X, Y, q=0.75, conf_level=0.999, q_init = 0.5)

    :param X: independent values (n x 1 list or np array)
    :param Y: dependent values (n x 1 list or np array)
    :param q: knee quantile level. The lower q, the less sensitive the knee detection, i.e. the knee, if any, will be detected at higher values of X.
    :param q_init: the percentile value where we start looking for the knee, e.g. if q_init = 0.5, we look for knees past the median of X.
    :param conf_level: knee detection confidence level. Set very high if we want knee certainty.
    :param n_knees: number of knees to detect
    :return: list of detected knee X-locations (possibly empty)
    """

    if len(X) != len(Y):
        print('invalid input lengths. X: ' + str(len(X)) + ' Y: ' + str(len(Y)))
        sys.exit(0)

    check_prob(q, 'q')
    check_prob(q_init, 'q_init')
    check_prob(conf_level, 'conf_level')
    if not(isinstance(n_knees, int)) or n_knees < 0:
        print('invalid n_knees: ' + str(n_knees))
        sys.exit(0)

    # close recursion
    if n_knees == 0:
        return []

    # sort by increasing X and add 1's for the intercept
    x0 = np.ones(len(X))  # add 1's for intercept
    Z = list(zip(x0, X, Y))
    Z.sort(key=itemgetter(1))

    init_cnt = int(q_init * len(Z))
    Z_q, Z_k = Z[:init_cnt], Z[init_cnt:]
    X_q, Y_q = np.array([z[:-1] for z in Z_q]), np.array([z[-1] for z in Z_q])
    q_reg_obj = QuantReg(endog=Y_q, exog=X_q)
    mdl = q_reg_obj.fit(q=q)
    ones, X_k, Y_k = zip(*Z_k)             # already sorted!
    Y_preds = mdl.predict(list(zip(ones, X_k)))  # predict all values from the q_init cut-point onwards
    signs = np.sign(Y_k - Y_preds)         # 1 if positive, -1 if negative, 0 if equal
    upr = np.maximum(0, signs)
    cum_upr = int((1.0 - q) * init_cnt) + np.cumsum(upr)  # cum_upr: count of points over regression line
    ttl_cnt = range(init_cnt, len(Z))                     # total running count
    rv = sp.binom(n=ttl_cnt, p=1.0 - q)
    diffs = 1.0 - conf_level - rv.sf(x=cum_upr - 1)
    knee_idx = find_ge_idx(diffs, 0.0)                    # knee: the first time we have binom_test(p_val) < 1-conf_level
    x_knee = X_k[knee_idx] if knee_idx < len(X_k) else None
    if x_knee is not None:
        if n_knees > 1:
            Z_n = [zn for zn in Z_k if zn[1] >= x_knee]
            if len(Z_n) > 10:
                ones, X_n, Y_n = zip(*Z_n)
                return [x_knee] + find_knee(X_n, Y_n, q=q, conf_level=conf_level, q_init=q_init, n_knees=n_knees - 1)
            else:
                return [x_knee]
        else:
            return [x_knee]
    else:
        return []
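A runnable variant of the docstring's M/M/1 example (assumes numpy is available and that check_prob and find_ge_idx from the same module are importable alongside find_knee):

import numpy as np

rng = np.random.RandomState(42)
X = rng.uniform(low=0.0, high=0.95, size=200)
Y = np.maximum(0.0, 1.0 / (1.0 - X) + rng.normal(0.0, 1.0, size=200))

knees = find_knee(X, Y, q=0.75, conf_level=0.999, q_init=0.5, n_knees=1)
print(knees)   # a single X value where Y starts blowing up, or [] if no knee is detected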
Example #20
0
File: dqr_spark.py Project: feng-li/dqr
                pdf=XY_pilot_pdf_i,
                onehot_column='features_ONEHOT',
                onehot_column_names=onehot_column_names,
                onehot_column_is_sparse=False)
        else:
            onehot_column_names = []

        column_names_x_full = X_names + onehot_column_names

        # statsmodels' quantile_regression is picky about covariates: (1) all covariates
        # must be float (integer dummies are not allowed), and (2) multicollinear
        # covariates will raise an error.
        dqr_pilot = QuantReg(
            endog=XY_pilot_pdf_i[Y_name],
            exog=(XY_pilot_pdf_i[column_names_x_full]).astype(float))
        dqr_pilot_res = dqr_pilot.fit(q=dqr_conf['quantile'])

        # dqr_pilot = QuantReg(endog=XY_pilot_pdf_i[Y_name],
        #                      exog=(XY_pilot_pdf_i[column_names_x_full[:21] + column_names_x_full[24:]]).astype(float))
        # dqr_pilot_res = dqr_pilot.fit(q=dqr_conf['quantile'])

        dqr_pilot_par = {
            'bandwidth': dqr_pilot_res.bandwidth,
            'params': dqr_pilot_res.params
        }

        # Step 2: Updating QR components
        tic_repartition = time.perf_counter()
        XY_sdf_i = XY_sdf_i.repartition(partition_num_sub[file_no_i],
                                        "partition_id")
        time_repartition_sub.append(time.perf_counter() - tic_repartition)
Example #21
0
evals = [(dtrain, 'train'), (dvalid_xy, 'eval')]
model = xgb.train(xgb_params,
                  dtrain,
                  num_boost_round=num_boost_rounds,
                  evals=evals,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose_eval=10)
valid_pred = model.predict(dvalid_x, ntree_limit=model.best_ntree_limit)
print("XGBoost validation set predictions:")
print(pd.DataFrame(valid_pred).head())
print("\nMean absolute validation error:")
print(mean_absolute_error(y_valid, valid_pred))

if OPTIMIZE_FUDGE_FACTOR:
    mod = QuantReg(y_valid, valid_pred)
    res = mod.fit(q=.5)
    print("\nLAD Fit for Fudge Factor:")
    print(res.summary())

    fudge = res.params[0]
    print("Optimized fudge factor:", fudge)
    print("\nMean absolute validation error with optimized fudge factor: ")
    print(mean_absolute_error(y_valid, fudge * valid_pred))

    fudge **= FUDGE_FACTOR_SCALEDOWN
    print("Scaled down fudge factor:", fudge)
    print("\nMean absolute validation error with scaled down fudge factor: ")
    print(mean_absolute_error(y_valid, fudge * valid_pred))
else:
    fudge = 1.0
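The fudge factor above is simply a median (LAD) regression of the actuals on the predictions through the origin, which yields a multiplicative rescaling that tends to reduce the mean absolute error. A standalone sketch with synthetic numbers (not the competition data):

import numpy as np
from sklearn.metrics import mean_absolute_error
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(1)
y_true = rng.gamma(2.0, 1.0, size=500)
y_pred = 1.1 * y_true + rng.normal(0.0, 0.2, size=500)    # systematically biased predictions

fudge = QuantReg(y_true, y_pred).fit(q=0.5).params[0]     # single slope, no intercept
print(mean_absolute_error(y_true, y_pred))                # before rescaling
print(mean_absolute_error(y_true, fudge * y_pred))        # after rescaling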
Example #22
0
File: functions.py Project: RobeeF/MCMB
def MCMB(Y, X, tau, size=50, extension=None, alpha=0.05, verbose=False, return_chain=False, sample_spacing=1, parallelize_mode='seq'):
    '''
    MCMB algorithm
    Y: dependent variable, 1-d numpy.ndarray
    X: covariates, (n, p) numpy.ndarray
    tau: quantile level of the regression
    size: number of betas to sample, i.e. the length of the Markov chain to generate
    extension: which extension of the MCMB algorithm to use: 'A' or None
    alpha: confidence level for which the intervals are returned
    verbose: set to True to display computation details. Only one level of verbosity
    return_chain: if True, return the chain of betas instead of the estimate and CIs
    sample_spacing: the frequency at which the betas are sampled; a large sample_spacing prevents autocorrelation
    parallelize_mode: how to parallelize the computation: 'p' for parallel (all the betas are updated in parallel), 'bp' for block parallel
        (n_jobs parallel betas are updated simultaneously), 'seq': as in Kocherginsky et al., the betas are updated sequentially.
    -----------------------------------
    returns (tuple): the initial estimate of the betas and the computed CIs if return_chain == False, the beta chain otherwise
    '''
    n_cores = multiprocessing.cpu_count()

    if extension=='A':
        A = compute_A(X)
        X = np.dot(X, A) # Normalisation
    
    # Estimation of beta_hat
    mod = QuantReg(Y, X)
    res = mod.fit(q=tau, max_iter=7000)
    beta_hat = res.params
    
    
    #Initialisation of parameters
    p = len(beta_hat)
    beta = beta_hat.copy()
    Beta = []
    i = 0

    Z = X_to_Z(X, Y, beta_hat, tau)
    
    remaining_iter = size*sample_spacing

    while remaining_iter>0:
        if parallelize_mode=='seq': # Same sequential updating as in Kocherginsky et al.
            for j in range(p):            
                beta_j =  weighted_quantile(X, Y, Z, beta, j, tau)
                beta = np.concatenate((beta[:j],[beta_j],beta[j+1:]))
                
        else: # n_cores betas_j are updated at each iteration of the loop 
            beta = beta_update_numba(p, beta, X, Y, Z, tau, n_cores)
            
        # Each sample_spacing iterations, we sample the betas
        if remaining_iter%sample_spacing == 0:
            Beta.append(copy.deepcopy(beta))
        
        i +=1
        remaining_iter-=1
        if verbose:
            print('Iteration ' + str(i) + ' successful!')


    Beta = [np.dot(np.array(Beta[i]),A).tolist() for i in range(len(Beta))] if extension=='A' else Beta
    beta_hat= np.dot(beta_hat,A) if extension=='A' else beta_hat
    
    # Covariance matrix
    Sigma = np.cov(np.array(Beta), rowvar=False)
    # Compute the Confidence Intervals
    CI =[]
    CI = [[beta_hat[i]-scipy.stats.norm.ppf(1-(alpha/2))*np.sqrt(Sigma[i,i]), 
       beta_hat[i]+scipy.stats.norm.ppf(1-(alpha/2))*np.sqrt(Sigma[i,i])] for i in range(p)]
    return Beta if return_chain else (beta_hat, CI) 
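A hypothetical usage sketch (synthetic data; it assumes the module's helpers compute_A, X_to_Z and weighted_quantile are importable together with MCMB, and that X already contains an intercept column):

import numpy as np

rng = np.random.default_rng(0)
n = 200
X = np.column_stack([np.ones(n), rng.normal(size=(n, 2))])
beta_true = np.array([1.0, 2.0, -1.0])
Y = X @ beta_true + rng.standard_t(df=3, size=n)

beta_hat, CI = MCMB(Y, X, tau=0.5, size=100, alpha=0.05)
print(beta_hat)   # point estimates at the median
print(CI)         # per-coefficient normal-approximation confidence intervals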
Example #23
0
    max_right_q = 0

    rsqs = []
    qs = []
    util = []

    values = {}
    for name in explanatory.columns:
        values[name] = np.zeros(10)
        i = 0
        for q in np.linspace(0.05, 0.95, 5):
            values[name][i] = 0
            i += 1

    for i, q in enumerate(np.linspace(0.05, 0.95, 5)):
        fitted = model.fit(q=q)
        adjr2 = fitted.prsquared
        qs.append(q)
        rsqs.append(adjr2)

        for name in fitted.params[fitted.pvalues < 0.05].index:
            if fitted.params[name] != 0:
                print(first_min_second[0] + "_" + first_min_second[1] + "\t" +
                      name + "\t" + str(fitted.params[name]) + "\t" + str(q) +
                      "\t" + str(fitted.pvalues[name]))
                values[name][i] = fitted.params[name]

        util.append(sum(fitted.pvalues < 0.05))

        if q > 0.5 and adjr2 > max_right:
            max_right = adjr2
Example #24
0
#for f1, f2 in itertools.combinations(orig.columns.copy(), 2):
#    prod = orig[f1].values * orig[f2].values
#    orig[f1 + '_times_' + f2] = prod

orig['is_catole'] = np.array(df['bairro'] == 'catole', dtype='d')
orig['is_centro'] = np.array(df['bairro'] == 'centro', dtype='d')
orig['is_liberdade'] = np.array(df['bairro'] == 'liberdade', dtype='d')

scaled = pd.DataFrame(StandardScaler().fit_transform(orig.copy().values),
                      columns=orig.columns)
print(orig.shape)
assert orig.shape == scaled.shape

# In[5]:

model = QuantReg(response, orig)

# In[6]:

for q in np.linspace(0.05, 0.95, 10):
    print(q)
    print(model.fit(q=q).summary())
    print()
    print()

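As a follow-on to the loop above, a small sketch (same model object) that collects the coefficient path across quantiles into a DataFrame instead of printing full summaries:

import numpy as np
import pandas as pd

quantiles = np.linspace(0.05, 0.95, 10)
coef_path = pd.DataFrame({q: model.fit(q=q).params for q in quantiles}).T
# rows: quantiles, columns: regressors from `orig`
print(coef_path.round(3))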
Example #25
0
    xmin = [-1., -1.]
    xmax = [2., 3.]
    mu, invSig = ConstructRBF(xmin, xmax, [3, 3])

    t0 = time.time()
    data_x, data_f = GenerateSample(xmin,
                                    xmax,
                                    N_sample=300,
                                    Func=Func,
                                    NoiseFunc=NoiseFunc)
    print('GenerateSample/Computation time:', time.time() - t0)

    t0 = time.time()
    Theta = np.array([FeaturesNG(x, mu, invSig) for x in data_x])
    quant_reg = QuantReg(data_f, Theta)
    fit1 = quant_reg.fit(q=0.1)
    fit5 = quant_reg.fit(q=0.5)
    fit9 = quant_reg.fit(q=0.95)
    w1 = fit1.params
    w5 = fit5.params
    w9 = fit9.params
    print(fit9.summary())
    print('Parameters w1:', w1)
    print('Parameters w5:', w5)
    print('Parameters w9:', w9)
    print('QuantReg/Computation time:', time.time() - t0)

    fp = open('/tmp/data.dat', 'w')
    for x, f in zip(data_x, data_f):
        fp.write('%f %f %f\n' % (x[0], x[1], f))
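To use the three fits for prediction, the fitted quantile surfaces can be evaluated at a new point by projecting onto the same RBF features (x_query below is a hypothetical test point):

x_query = [0.5, 1.0]
phi = FeaturesNG(x_query, mu, invSig)   # same feature map as used for training
print('quantiles 0.1 / 0.5 / 0.95 at x_query:',
      np.dot(phi, w1), np.dot(phi, w5), np.dot(phi, w9))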
Example #26
0
    max_right_q = 0
    
    rsqs = []
    qs = []
    util = []
    
    values = {}
    for name in explanatory.columns:
        values[name] = np.zeros(10)
        i = 0
        for q in np.linspace(0.05, 0.95, 5):
            values[name][i] = 0
            i += 1
            
    for i, q in enumerate(np.linspace(0.05, 0.95, 5)):
        fitted = model.fit(q=q)
        adjr2 = fitted.prsquared
        qs.append(q)
        rsqs.append(adjr2)
        
        for name in fitted.params[fitted.pvalues < 0.05].index:
            if fitted.params[name] != 0:
                print(first_min_second[0] + "_" + first_min_second[1] + "\t" + name +
                      "\t" + str(fitted.params[name]) + "\t" + str(q) +
                      "\t" + str(fitted.pvalues[name]))
                values[name][i] = fitted.params[name]
            
        util.append(sum(fitted.pvalues < 0.05))
        
        if q > 0.5 and adjr2 > max_right:
            max_right = adjr2
            max_right_q = q