Exemplo n.º 1
0
 def fit_transform(self, X, y=None):
     """Box-Cox transform ``X`` and remember the fitted lambda(s).

     A 1-D input is transformed as a single variable; an input with more
     than one dimension is transformed column by column.  The fitted
     lambda (or tuple of per-column lambdas) is stored on
     ``self.lambdas``.  ``y`` is accepted but not used.
     """
     if len(X.shape) <= 1:
         transformed, self.lambdas = stats.boxcox(X)
         return np.array(transformed)

     # One (transformed column, lambda) pair per column of X.
     per_column = [stats.boxcox(X[:, j]) for j in range(X.shape[1])]
     transformed, self.lambdas = zip(*per_column)
     # The columns were collected as rows, so transpose back.
     return np.array(transformed).T
Exemplo n.º 2
0
 def transform(self, X):
     """Apply the fitted shift, Box-Cox transform and standardization to ``X``.

     Reads ``self.shift``, ``self.lmbda``, ``self.xmean`` and ``self.xstd``.
     When ``self.lmbda`` is a single float the whole input is transformed
     with it; otherwise column ``j`` is transformed with ``self.lmbda[j]``.

     Returns the transformed data minus ``self.xmean``, divided by
     ``self.xstd``.  The caller's array is left untouched.
     """
     # Build a shifted copy instead of ``X += self.shift``: the in-place
     # form modified the caller's data, so transforming the same array
     # twice applied the shift twice (and failed for integer arrays).
     shifted = X + self.shift
     if isinstance(self.lmbda, float):
         xb = boxcox(shifted, self.lmbda)
     else:
         xb = numpy.zeros(shape=shifted.shape)
         for j, lmb in enumerate(self.lmbda):
             xb[:, j] = boxcox(shifted[:, j], lmb)
     return (xb - self.xmean) / self.xstd
Exemplo n.º 3
0
 def better_loglikelihood(self,param_e):
     """Return a Gaussian log-likelihood on Box-Cox-transformed data.

     All but the last two entries of ``param_e`` are passed to the model
     (``emulate`` when its ``typ`` is "emulator", ``run`` when it is
     "swmm").  ``param_e[-2]`` scales ``self.cov_mat_e_base`` and
     ``param_e[-1]`` scales ``self.cov_mat_b_base``; their sum is the
     covariance of the likelihood.
     """
     model = self.result_producing_thing
     if model.typ == "emulator":
         model.emulate(param_e[0:-2])
     if model.typ == "swmm":
         model.run(param_e[0:-2])

     def positive_boxcox(values):
         # Non-positive entries are zeroed, everything is offset by 0.01
         # and then Box-Cox transformed with a fixed lambda of 0.35.
         return stats.boxcox((values > 0) * values + 0.01, 0.35)

     observed = positive_boxcox(self.measurement)
     predicted = positive_boxcox(model.result)
     cov = param_e[-1] * self.cov_mat_b_base + \
         self.cov_mat_e_base * param_e[-2]

     residual = predicted - observed
     log_det = np.linalg.slogdet(cov)[1]
     quad_form = np.dot(residual, np.linalg.solve(cov, residual))
     normalizer = self.t * np.log(2 * np.pi)
     return -0.5 * log_det - 0.5 * quad_form - 0.5 * normalizer
Exemplo n.º 4
0
    def test_fixed_lmbda(self):
        np.random.seed(12345)
        x = stats.loggamma.rvs(5, size=50) + 5
        xt = stats.boxcox(x, lmbda=1)
        assert_allclose(xt, x - 1)
        xt = stats.boxcox(x, lmbda=-1)
        assert_allclose(xt, 1 - 1/x)

        xt = stats.boxcox(x, lmbda=0)
        assert_allclose(xt, np.log(x))

        # Also test that array_like input works
        xt = stats.boxcox(list(x), lmbda=0)
        assert_allclose(xt, np.log(x))
Exemplo n.º 5
0
def executeOneSetting(tensor, density, roundId, para):
    """Run one density/round experiment over every time slice of ``tensor``.

    For each slice along the third axis the data is Box-Cox transformed and
    rescaled, split into train/test matrices by ``evallib.removeEntries``,
    predicted with ``AMF.predict``, mapped back to the original scale and
    scored with ``evallib.errMetric``.  The per-slice result (metrics and
    running time) is written through ``evallib.dumpresult``.

    ``U``, ``S``, ``p`` and ``q`` are created once, before the slice loop,
    and the same objects are passed to every ``AMF.predict`` call.

    ``para`` is read for the keys 'dimension', 'metrics', 'outPath',
    'dataName' and 'dataType' (and is also handed to ``AMF.predict``).
    """
    logger.info('density=%.2f, %2d-round starts.'%(density, roundId + 1))
    (numUser, numService, numTime) = tensor.shape
    dim = para['dimension']

    # initialization
    U = np.random.rand(numUser, dim)
    S = np.random.rand(numService, dim)
    p = np.zeros(numUser)
    q = np.zeros(numService)

    # run for each time slice
    for sliceId in xrange(numTime):
        # boxcox data transformation
        matrix = tensor[:, :, sliceId]
        dataVector = matrix[:]
        # Lambda (alpha) is fitted on the strictly positive entries only;
        # their transformed min/max define the rescaling used below.
        (transfVector, alpha) = stats.boxcox(dataVector[dataVector > 0])
        maxV = np.max(transfVector)
        minV = np.min(transfVector)
        transfMatrix = matrix.copy()
        # NOTE(review): presumably -1 marks missing entries -- confirm.
        # Every entry different from -1 is transformed and then rescaled
        # with (value - minV) / (maxV - minV); the mask is recomputed for
        # each assignment.
        transfMatrix[transfMatrix != -1] = stats.boxcox(transfMatrix[transfMatrix != -1], alpha)
        transfMatrix[transfMatrix != -1] = (transfMatrix[transfMatrix != -1] - minV) / (maxV - minV)

        # remove data entries to generate trainMatrix and testMatrix  
        # The seed depends on both the round and the slice.
        seedID = roundId + sliceId * 100
        (trainMatrix, testMatrix) = evallib.removeEntries(matrix, density, seedID)
        # Training uses transformed values; entries not > 0 become 0.
        trainMatrix = np.where(trainMatrix > 0, transfMatrix, 0)
        (testVecX, testVecY) = np.where(testMatrix)     
        # Ground truth is taken from the untransformed slice.
        testVec = matrix[testVecX, testVecY]

        # invocation to the prediction function
        startTime = time.clock() 
        predictedMatrix = AMF.predict(trainMatrix, U, S, p, q, para)     
        runningTime = float(time.clock() - startTime)

        # evaluate the estimation error  
        predVec = predictedMatrix[testVecX, testVecY]
        # Undo the min/max rescaling, then map back with evallib.argBoxcox
        # (presumably the inverse Box-Cox transform -- confirm).
        predVec = (maxV - minV) * predVec + minV
        predVec = evallib.argBoxcox(predVec, alpha)
        evalResult = evallib.errMetric(testVec, predVec, para['metrics'])
        result = (evalResult, runningTime)

        # dump the result at each density
        outFile = '%s%s_%s_result_%02d_%.2f_round%02d.tmp'%(para['outPath'], 
            para['dataName'], para['dataType'], sliceId + 1, density, roundId + 1)
        evallib.dumpresult(outFile, result)
        logger.info('sliceId=%02d done.'%(sliceId + 1))
        
    logger.info('density=%.2f, %2d-round done.'%(density, roundId + 1))
    logger.info('----------------------------------------------')
Exemplo n.º 6
0
def auto_arima(endog, freq=None, d=None, D=None, max_p=5, max_q=5, max_P=2, max_Q=2, max_order=5, max_d=2, max_D=1, start_p=2, start_q=2, start_P=1, start_Q=1, stationary=False,
               ic="aic", stepwise=True, trace=False, approximation=None,
               test="adf", seasonal_test="ch", allowdrift=True, allowmean=True, lambda_parameter=None, *args, **kwargs):
        # Parameter Validity Check
    if np.any(np.isnan(endog)):
        raise ValueError("Missing Values in Series")
    origin_endog = endog
    if _is_using_pandas(endog, None):
        endog = np.asarray(endog)
    if len(endog) <= 10:
        raise ValueError("There are too few observations.")
    if np.any(np.isnan(endog)):
        raise ValueError("NaN values in endogenous not allowed")
    if np.all(endog == endog[0]):
        raise ValueError("The endogenous variable is a constant")
    if (not isinstance(freq, int)) or freq <= 1:
        raise ValueError("The frequency parameter must be a integer greater than 1")
    if lambda_parameter is not None:
        if lambda_parameter < 0:
            raise ValueError("The Lambda parameter must be positive")
        if not np.all(endog > 0):
            raise ValueError("Box-Cox Transformation can be only used on positive series.")
        endog = boxcox(endog, lambda_parameter)

    max_p = max_p if max_p <= floor(len(endog) / 3) else floor(len(endog) / 3)
    max_q = max_q if max_q <= floor(len(endog) / 3) else floor(len(endog) / 3)
    max_P = max_P if max_P <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq)
    max_Q = max_Q if max_Q <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq)
    if stationary:
        D = 0
        d = 0
    if freq == 1:
Exemplo n.º 7
0
    def test_alpha(self):
        np.random.seed(1234)
        x = stats.loggamma.rvs(5, size=50) + 5

        # Some regular values for alpha, on a small sample size
        _, _, interval = stats.boxcox(x, alpha=0.75)
        assert_allclose(interval, [4.004485780226041, 5.138756355035744])
        _, _, interval = stats.boxcox(x, alpha=0.05)
        assert_allclose(interval, [1.2138178554857557, 8.209033272375663])

        # Try some extreme values, see we don't hit the N=500 limit
        x = stats.loggamma.rvs(7, size=500) + 15
        _, _, interval = stats.boxcox(x, alpha=0.001)
        assert_allclose(interval, [0.3988867, 11.40553131])
        _, _, interval = stats.boxcox(x, alpha=0.999)
        assert_allclose(interval, [5.83316246, 5.83735292])
Exemplo n.º 8
0
    def test_mle(self):
        maxlog = stats.boxcox_normmax(self.x, method='mle')
        assert_allclose(maxlog, 1.758101, rtol=1e-6)

        # Check that boxcox() uses 'mle'
        _, maxlog_boxcox = stats.boxcox(self.x)
        assert_allclose(maxlog_boxcox, maxlog)
def readIn_PredictionData(fn,dfmax,dfmin,transformationFunction):
    """Read a CSV file and transform, then normalize, every column.

    Parameters
    ----------
    fn : path of the comma-separated file to read.
    dfmax, dfmin : indexable by column position; ``dfmax[i]`` and
        ``dfmin[i]`` are passed to ``normalize_predictioninput`` for
        column ``i``.
    transformationFunction : one of "bin", "binlog", "log", "sqrt" or
        "boxcox"; any other value skips the transformation and only
        normalizes.

    Returns
    -------
    The data frame with every column replaced by its processed values.
    """
    # NOTE(review): ``header=False`` is kept as written.  Recent pandas
    # releases appear to reject a boolean header -- confirm whether
    # ``header=None`` (no header row) or ``header=0`` was intended.
    df = pd.read_csv(fn,sep=",",header=False)

    # Single-argument call form prints the same on Python 2 and 3.
    print(len(df.columns))
    for i, col in enumerate(df.columns):
        if transformationFunction == "bin":
            # Values of 31 and above are collapsed to 50.
            df[col] = [(x if x < 31 else 50) for x in df[col]]
        elif transformationFunction == "binlog":
            # Zeros become 0.5 so that the logarithm is defined.
            df[col] = [(0.5 if x == 0 else (x if x < 31 else 50)) for x in df[col]]
            df[col] = np.log(df[col])
        elif transformationFunction == "log":
            df[col] = [(0.5 if x == 0 else x) for x in df[col]]
            df[col] = np.log(df[col])
        elif transformationFunction == "sqrt":
            df[col] = np.sqrt(df[col])
        elif transformationFunction == "boxcox":
            df[col] = stats.boxcox(np.array(df[col]))[0]
        df[col] = normalize_predictioninput(np.array(df[col]), dfmax[i], dfmin[i])
    return df
def sgs(data, xsteps=10, ysteps=10,
        nugget_dist=10, x_col='x_m', y_col='y_m', flux_col='flux',
        transform_data=True, invert_transform=True):
    """Visit a grid along a path, kriging each point from all earlier data.

    Every grid point returned by ``makePathAndGrid`` is estimated with a
    kriging model built from the current data; the estimate is then
    appended to the data before the next point is visited.

    Parameters
    ----------
    data : frame holding the coordinate and flux columns.
    xsteps, ysteps : forwarded to ``makePathAndGrid``.
    nugget_dist : forwarded to ``kriging.krig_model``.
    x_col, y_col, flux_col : column names for x, y and flux.
    transform_data : Box-Cox transform the flux before kriging.
    invert_transform : map the results back with ``invboxcox`` (only
        when ``transform_data`` is also true).

    Returns
    -------
    The x grid, the y grid (both reshaped like ``M``), the estimate grid
    ``M`` and the lists of visited x, y and estimated flux values.
    """
    def column_values(name, default_name):
        # Honour the configured column name; fall back to the historical
        # hard-coded name so callers that only used the *_col arguments
        # to label the working frame keep working.
        source = data[name] if name in data else data[default_name]
        return source.values

    x = column_values(x_col, 'x_m')
    y = column_values(y_col, 'y_m')
    flux = column_values(flux_col, 'flux')
    if transform_data:
        flux, L =  scpstats.boxcox(flux)
    data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])
    new_x = []
    new_y = []
    new_flux = []
    # create array for the output
    idx, grid, indexGrid, M = makePathAndGrid(data, xsteps, ysteps)
    for step in idx :
        point = [grid[0][step], grid[1][step]]
        model = kriging.krig_model(data, nugget_dist, x_col, y_col, flux_col)
        est = kriging.krig_sample(model, point)
        indexPoint = [indexGrid[0][step], indexGrid[1][step]]
        M[indexPoint[0], indexPoint[1]] = est
        x = np.r_[x, point[0]]
        new_x.append(x[-1])
        y = np.r_[y, point[1]]
        new_y.append(y[-1])
        flux = np.r_[flux, est]
        new_flux.append(flux[-1])
        # The new estimate conditions every later point, so the working
        # frame is rebuilt after each step.
        data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])

    if invert_transform and transform_data:
        M = invboxcox(M, L)
        new_flux = invboxcox(np.array(new_flux), L)

    return grid[0,:].reshape(M.shape), grid[1,:].reshape(M.shape), M, new_x, new_y, new_flux
 def transform(self, x):
     """Transform each column of ``x`` with its fitted parameter.

     Column ``i`` is paired with ``self.trans_params[i]`` and passed to
     ``self.w_t`` ('lambert') or ``boxcox`` ('boxcox').  Any other
     ``self.method`` raises ``NotImplementedError``.
     """
     x = np.asarray(x)
     if self.method == 'lambert':
         column_transform = self.w_t
     elif self.method == 'boxcox':
         column_transform = boxcox
     else:
         raise NotImplementedError

     transformed = []
     for column, param in zip(x.T, self.trans_params):
         transformed.append(column_transform(column, param))
     # Columns were collected as rows, so transpose back.
     return np.array(transformed).T
Exemplo n.º 12
0
def processing(data):
    """Clean ``data`` in place: add/drop features, impute, de-skew, map values.

    The frame is modified in place and nothing is returned.  The steps
    rely on the module-level ``create_feature``, ``to_drop`` and
    ``mapper`` objects.
    """
    # Build the engineered features.
    create_feature(data);
    # Drop the unwanted features.
    data.drop(to_drop,axis=1,inplace=True)
    
    # Fill with the string 'None': according to the feature description,
    # 'None' is itself a valid value for these features, so their missing
    # values are filled with 'None'.
    fill_none = ['MasVnrType','BsmtExposure','GarageType','MiscFeature']
    for col in fill_none:
        data[col].fillna('None',inplace=True);
        
    # Fill the remaining missing values: the mode for categorical
    # features, the median for numeric ones.
    na_col = data.dtypes[data.isnull().any()];
    for col in na_col.index:
        if na_col[col] != 'object':
            med = data[col].median();
            data[col].fillna(med,inplace=True);
        else:
            mode = data[col].mode()[0];
            data[col].fillna(mode,inplace=True);
    
    # Normalise skewed features.  numeric_col holds the numeric features,
    # zero_col the features that contain zeros.  Transforming features
    # that contain zeros causes assorted small problems, so for those only
    # the non-zero values are transformed.
    numeric_col = data.skew().index;
    zero_col = data.columns[data.isin([0]).any()]
    for col in numeric_col:
        # Skip "condition"-like features that only take a few values
        # (0, 1, 2, 3, ...): transforming them is of little use.
        if len(pd.value_counts(data[col])) <= 10 : continue; 
        # For a feature containing zeros, only the positive values are
        # transformed; boxcox picks the form of the transform from the data.
        if col in zero_col:       
            trans_data = data[data>0][col];
            before = abs(trans_data.skew());
            cox,_ = boxcox(trans_data)
            log_after = abs(Series(cox).skew());
            # Keep the transform only when it reduces the absolute skew.
            if log_after < before:
                data.loc[trans_data.index,col] = cox;
        # A feature without zeros is transformed as a whole.
        else:
            before = abs(data[col].skew());
            cox,_ = boxcox(data[col])
            log_after = abs(Series(cox).skew());
            if log_after < before:
                data.loc[:,col] = cox;
    # Apply the value mappings defined in mapper.
    for col,mapp in mapper.items():
        data.loc[:,col] = data[col].map(mapp);
def boxcoxtransform(dataframe, numeric_feats):
    """Box-Cox transform the skewed numeric columns of ``dataframe`` in place.

    A column of ``numeric_feats`` is treated as skewed when the skewness
    of its non-missing values exceeds 0.25.  Each such column is shifted
    by +1 and replaced by its Box-Cox transform.

    Returns the same frame and a ``defaultdict(float)`` mapping each
    transformed column name to its fitted lambda.
    """
    lambdas = defaultdict(float)
    skewness = dataframe[numeric_feats].apply(
        lambda column: skew(column.dropna()))
    selected = skewness[skewness > 0.25].index

    for name in selected:
        # Shift first, then transform the shifted column.
        dataframe[name] = dataframe[name] + 1
        dataframe[name], lambdas[name] = boxcox(dataframe[name])
    return dataframe, lambdas
Exemplo n.º 14
0
    def test_lmbda_None(self):
        np.random.seed(1234567)
        # Start from normal rv's, do inverse transform to check that
        # optimization function gets close to the right answer.
        np.random.seed(1245)
        lmbda = 2.5
        x = stats.norm.rvs(loc=10, size=50000)
        x_inv = (x * lmbda + 1)**(-lmbda)
        xt, maxlog = stats.boxcox(x_inv)

        assert_almost_equal(maxlog, -1 / lmbda, decimal=2)
 def fit(self, x):
     """Estimate one transformation parameter per column of ``x``.

     The parameters are appended to ``self.trans_params`` in column
     order: the result of ``self.iterate_moments`` for 'lambert', the
     fitted Box-Cox lambda for 'boxcox'.  Any other ``self.method``
     raises ``NotImplementedError``.
     """
     x = np.asarray(x)
     if self.method == 'lambert':
         def estimate(column):
             return self.iterate_moments(column, tol=self.tol,
                                         max_iter=self.max_iter)
     elif self.method == 'boxcox':
         def estimate(column):
             # boxcox returns (transformed data, lambda); keep the lambda.
             return boxcox(column)[1]
     else:
         raise NotImplementedError

     for column in x.T:
         self.trans_params.append(estimate(column))
Exemplo n.º 16
0
def transform_data_to_gaussian_1D(feature_vector):
    """Return the Box-Cox transform of a 1-D feature vector.

    The Box-Cox lambda is fitted by ``stats.boxcox`` and discarded; only
    the transformed data is returned.
    """
    # The unused matplotlib import and unused locals were dropped, so the
    # function no longer needs a plotting backend to be installed.
    new_x, _ = stats.boxcox(feature_vector)
    return new_x
def boxcox(x,y,y_label):
    """Fit ``regr`` on a Box-Cox-transformed target and plot the predictions.

    ``y`` is shifted by ``abs(min(y)) + 1`` before the transform, the
    module-level ``regr`` is fitted on ``x`` against the transformed
    target, and its predictions are mapped back with ``inv_boxcox`` and
    the same shift.  The ratio of predicted to observed variance is
    printed and a scatter plot of observed against predicted values is
    shown, with ``y_label`` as the x-axis label.
    """
    # Compute the offset once so the forward shift and its inverse are
    # guaranteed to use the same value.
    offset = abs(min(y))
    box_cox, maxlog = stats.boxcox(y + offset + 1)
    regr.fit(x,box_cox)
    box_cox_predict = regr.predict(x)
    y_predict = inv_boxcox(box_cox_predict,maxlog) - offset - 1
    # Single-argument call form prints the same on Python 2 and 3.
    print("R squared: " + str(np.var(y_predict)/np.var(y)))
    # Plot outputs
    fig = plt.figure()
    plt.scatter(y, y_predict, color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()
def box_cox(df, lmbda=None, alpha=None):
    """
    Performs a Box-Cox Transformation on all columns (features) of a pandas
    dataframe.

    Box-Cox requires strictly positive data, so every column is first
    shifted to ``column - min(column) + 0.1``.

    Parameters
    ----------
    df : the frame whose columns are transformed.
    lmbda : fixed lambda passed to ``stats.boxcox``; ``None`` lets SciPy
        fit a lambda for each column.
    alpha : passed through to ``stats.boxcox``.

    Returns
    -------
    A new frame with the same columns holding the transformed data.
    """
    df_tr = pd.DataFrame(columns=df.columns)  #initialize empty data frame with same features as df
    for val in list(df.columns):
        shifted = df[val] - min(df[val]) + 0.1
        result = stats.boxcox(shifted, lmbda, alpha)
        # With a fixed lambda stats.boxcox returns the transformed array
        # itself; only when lambda is fitted does it return a tuple whose
        # first item is the data.  Indexing [0] unconditionally picked a
        # single scalar in the fixed-lambda case.
        df_tr[val] = result if lmbda is not None else result[0]
    return df_tr
Exemplo n.º 19
0
 def fit(self, X):
     """Fit the shift, Box-Cox lambda(s) and standardization statistics.

     For 2-D input a per-column shift (the negated column minimum,
     floored at zero, plus three standard deviations) and a per-column
     lambda are fitted; each lambda is clamped to
     ``[self.minlmbda, self.maxlmbda]`` and snapped to 0 when its
     magnitude is below 1e-4.  For 1-D input a single shift and lambda
     are fitted.  The mean and standard deviation of the transformed data
     are stored in ``self.xmean`` and ``self.xstd``.

     The caller's array is left untouched.
     """
     xtrans = numpy.zeros(shape=X.shape)
     if len(X.shape) == 2:
         # Force a float shift so the in-place additions below also work
         # when X holds integers.
         self.shift = (-X.min(axis=0)).astype(float)
         self.shift[self.shift < 0] = 0
         self.shift += 3 * X.std(axis=0)
         # Work on a shifted copy rather than ``X += self.shift``, which
         # silently modified the caller's data.
         shifted = X + self.shift
         self.lmbda = numpy.zeros(shifted.shape[1])
         for j in range(shifted.shape[1]):
             _, self.lmbda[j] = boxcox(shifted[:, j])
             self.lmbda[j] = max(self.lmbda[j], self.minlmbda)
             self.lmbda[j] = min(self.lmbda[j], self.maxlmbda)
             if numpy.abs(self.lmbda[j]) < 1e-4:
                 self.lmbda[j] = 0
                 # Single-argument call form works on Python 2 and 3.
                 print("changing lambda")
             # Re-transform with the clamped lambda.
             xtrans[:, j] = boxcox(shifted[:, j], self.lmbda[j])
     elif len(X.shape) == 1:
         self.shift = max([1e-10, -X.min()])
         self.shift += 3 * X.std()
         shifted = X + self.shift
         xtrans, self.lmbda = boxcox(shifted)
     self.xmean = xtrans.mean(axis=0)
     self.xstd = xtrans.std(axis=0)
Exemplo n.º 20
0
def append_boxcox(data, cols, drop_old=False):
    """Apply boxcox transformations to a list of columns

    data: a pandas DataFrame, modified in place
    cols: a column name, or a list of column names, for which to perform
        boxcox transformations
    drop_old: when true, each source column is dropped after its
        ``<name>_boxcox`` column has been added

    Nothing is returned.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so a
    # single column name is also accepted on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(cols, string_types):
        cols = [cols]

    for col in cols:
        # boxcox also returns maxlog, the lambda param that is choosen
        # could be used for pipelining objects
        data[col + '_boxcox'] = stats.boxcox(data[col])[0]
        if drop_old:
            data.drop(col, axis=1, inplace=True)
Exemplo n.º 21
0
def mungeskewed(train, test, numeric_feats):
    """Stack train and test rows and Box-Cox transform the skewed features.

    ``test`` receives a ``loss`` column of zeros (in place) before the
    two frames are concatenated.  Skewness is measured on ``train`` only;
    every feature of ``numeric_feats`` whose skewness exceeds 0.25 is
    shifted by +1 and Box-Cox transformed in the combined frame.

    Returns the combined frame and the number of training rows.
    """
    ntrain = train.shape[0]
    test['loss'] = 0
    combined = pd.concat((train, test)).reset_index(drop=True)

    # compute skew and do Box-Cox transformation (Tilli)
    skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))
    print("\nSkew in numeric features:")
    print(skewness)

    for name in skewness[skewness > 0.25].index:
        combined[name] = combined[name] + 1
        # The fitted lambda is not kept.
        combined[name], _ = boxcox(combined[name])
    return combined, ntrain
Exemplo n.º 22
0
def transform_features(x_train, x_test):
    """Box-Cox transform the first six feature columns of both sets.

    Only columns 0-5 are kept.  The optimal lambda of each column is
    computed on the training set and the same lambda is then applied to
    the matching column of the testing set.  Every value is shifted by
    1.0 before the transform.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Untransformed training features.
    x_test : np.array [n_samples, n_features]
        Untransformed testing features.

    Returns
    -------
    x_train_boxcox : np.array [n_samples, n_features_trans]
        Transformed training features.
    x_test_boxcox : np.array [n_samples, n_features_trans]
        Transformed testing features.
    """
    offset = 1.0  # shift features away from zero

    x_train = x_train[:, 0:6]
    x_test = x_test[:, 0:6]
    _, n_feats = x_train.shape

    # Pass 1: fit lambda on each training column and keep the transform.
    train_out = np.zeros(x_train.shape)
    lambdas = np.zeros((n_feats,))
    for col in range(n_feats):
        fitted, lam = boxcox(x_train[:, col] + offset)
        train_out[:, col] = fitted
        lambdas[col] = lam

    # Pass 2: reuse the training lambdas on the testing columns.
    test_out = np.zeros(x_test.shape)
    for col, lam in enumerate(lambdas):
        test_out[:, col] = boxcox(x_test[:, col] + offset, lmbda=lam)

    return train_out, test_out
Exemplo n.º 23
0
def boxcox(X):
    """
    Gaussianize X using the Box-Cox transformation: [samples x phenotypes]

    - each phenotype is brought to a positive scale, by first subtracting
      the minimum value and adding 1.
    - Then each phenotype is transformed by the boxcox transformation
    - NaN entries are excluded from the fit and stay NaN in the output

    Returns the transformed array (always floating point) and the array
    of fitted lambdas, one per column.
    """
    # Local import: the array helpers used here are NumPy functions; the
    # aliases of them in the top-level ``scipy`` namespace are deprecated.
    import numpy as np

    X = np.asarray(X)
    # Force a float result so integer input is not truncated on assignment.
    X_transformed = np.zeros_like(X, dtype=float)
    maxlog = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        i_nan = np.isnan(X[:, i])
        values = X[~i_nan, i]
        X_transformed[i_nan, i] = X[i_nan, i]
        X_transformed[~i_nan, i], maxlog[i] = st.boxcox(values - values.min() + 1.0)
    return X_transformed, maxlog
Exemplo n.º 24
0
 def preprocess_feature(cls, feature, parameters):
     """Normalize one feature array according to ``parameters.feature_type``.

     BINARY features are mapped to 0/1.  Otherwise an optional Box-Cox
     transform is applied when ``parameters.boxcox_lambda`` is set, and
     the remaining handling depends on the type: logit for PROBABILITY,
     ``cls.value_to_quantile`` for QUANTILE, one row of indicator values
     per entry for ENUM, linear rescaling for CONTINUOUS_ACTION, and
     mean/stddev standardization with clipping for everything else.

     Entries close to ``MISSING_VALUE`` are zeroed in the returned array
     (for ENUM their row is left all-zero).
     """
     # 1 where a value is present, 0 where it is (close to) MISSING_VALUE.
     is_not_empty = 1 - np.isclose(feature, MISSING_VALUE)
     if parameters.feature_type == identify_types.BINARY:
         # Binary features are always 1 unless they are 0
         return ((feature != 0) * is_not_empty).astype(np.float32)
     if parameters.boxcox_lambda is not None:
         # Shift, then floor at BOX_COX_MARGIN before the transform.
         feature = stats.boxcox(
             np.maximum(feature + parameters.boxcox_shift, BOX_COX_MARGIN),
             parameters.boxcox_lambda,
         )
     # No *= to ensure consistent out-of-place operation.
     if parameters.feature_type == identify_types.PROBABILITY:
         # Clip away from 0 and 1 so the logit stays finite.
         feature = np.clip(feature, 0.01, 0.99)
         feature = special.logit(feature)
     elif parameters.feature_type == identify_types.QUANTILE:
         transformed_feature = np.zeros_like(feature)
         for i in six.moves.range(feature.shape[0]):
             transformed_feature[i] = cls.value_to_quantile(
                 feature[i], parameters.quantiles
             )
         feature = transformed_feature
     elif parameters.feature_type == identify_types.ENUM:
         possible_values = parameters.possible_values
         # Map each possible value to its column index in the output.
         mapping = {}
         for i, possible_value in enumerate(possible_values):
             mapping[possible_value] = i
         output_feature = np.zeros((len(feature), len(possible_values)))
         for i, val in enumerate(feature):
             if abs(val - MISSING_VALUE) < 1e-2:
                 # This check is required by the PT preprocessing but not C2
                 continue
             output_feature[i][mapping[val]] = 1.0
         # ENUM returns early: the final missing-value masking is skipped.
         return output_feature
     elif parameters.feature_type == identify_types.CONTINUOUS_ACTION:
         min_value = parameters.min_value
         max_value = parameters.max_value
         # Maps min_value to -1 + 1e-6 and max_value to 1 - 1e-6.
         feature = (
             (feature - min_value) * ((1 - 1e-6) * 2 / (max_value - min_value))
             - 1
             + 1e-6
         )
     else:
         feature = feature - parameters.mean
         feature /= parameters.stddev
         feature = np.clip(feature, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)
     # Zero out the entries that were missing in the input.
     feature *= is_not_empty
     return feature
Exemplo n.º 25
0
    def transform(self, x):
        """Gaussianize each column of ``x`` with the fitted parameters.

        1-D input is treated as a single column.  Shape problems are only
        reported with a printed message; processing continues.  Column
        ``i`` is transformed with ``self.taus[i]`` for the 'lambert' and
        'boxcox' strategies, while 'brute' maps ranks through the normal
        quantile function.  Any other strategy raises
        ``NotImplementedError``.
        """
        x = np.asarray(x)
        if len(x.shape) == 1:
            x = x[:, np.newaxis]
        elif len(x.shape) != 2:
            # Single-argument call form prints the same on Python 2 and 3.
            print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")
        if x.shape[1] != len(self.taus):
            print("%d variables in test data, but %d variables were in training data." % (x.shape[1], len(self.taus)))

        if self.strategy == 'lambert':
            return np.array([w_t(x_i, tau_i) for x_i, tau_i in zip(x.T, self.taus)]).T
        elif self.strategy == 'brute':
            return np.array([norm.ppf((rankdata(x_i) - 0.5) / len(x_i)) for x_i in x.T]).T
        elif self.strategy == 'boxcox':
            return np.array([boxcox(x_i, lmbda=lmbda_i) for x_i, lmbda_i in zip(x.T, self.taus)]).T
        else:
            raise NotImplementedError
Exemplo n.º 26
0
    def fit(self, x):
        """Estimate one Gaussianizing parameter per column of ``x``.

        1-D input is treated as a single column; input that is neither
        1-D nor 2-D is only reported with a printed message.  Parameters
        are appended to ``self.taus`` in column order: the ``igmm``
        result for 'lambert', ``None`` for 'brute' and the fitted lambda
        for 'boxcox'.  Any other strategy raises ``NotImplementedError``.
        """
        x = np.asarray(x)
        if len(x.shape) == 1:
            x = x[:, np.newaxis]
        elif len(x.shape) != 2:
            # Single-argument call form prints the same on Python 2 and 3.
            print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

        if self.strategy == 'lambert':
            for x_i in x.T:
                self.taus.append(igmm(x_i, tol=self.tol, max_iter=self.max_iter))
        elif self.strategy == 'brute':
            for x_i in x.T:
                self.taus.append(None)  # TODO: In principle, we could store parameters to do a quasi-invert
        elif self.strategy == 'boxcox':
            for x_i in x.T:
                self.taus.append(boxcox(x_i)[1])
        else:
            raise NotImplementedError
Exemplo n.º 27
0
    def Preprocess_TransformNumericFeatures(self, dfall, trans_type ='boxcox', correction=0.00001):
        """Box-Cox transform every column listed in ``self.num_features``.

        Parameters
        ----------
        dfall : frame holding the numeric feature columns; modified in
            place and also returned.
        trans_type : only 'boxcox' is supported.
        correction : constant added to each column before the transform.

        The fitted lambda of each column is stored in ``self.lmbdaDict``.

        Raises
        ------
        TypeError
            If ``self.num_features`` is ``None`` or ``trans_type`` is not
            supported.
        """
        # The ``return`` statements that followed each ``raise`` could
        # never run and have been removed.
        if self.num_features is None:
            raise TypeError("Execute the SetUpTrainTest method to use this feature")

        if trans_type not in ['boxcox']:
            raise TypeError("Transformation type not supported")

        self.lmbdaDict = {}
        for c in self.num_features:
            # One pre-formatted string prints the same on Python 2 and 3;
            # it also restores the space that was missing between the
            # transformation name and the word "transformation".
            print('Applying {0} transformation on: {1}'.format(trans_type, c))
            if trans_type == 'boxcox':
                transformed, lmbda = stats.boxcox(dfall[c] + correction)
                dfall[c] = transformed
                self.lmbdaDict[c] = lmbda

        return dfall
Exemplo n.º 28
0
def _estimate_lambda_single_y(y):
    """Return the Box-Cox lambda that SciPy fits for a single vector.

    No validation is performed on the input.

    Parameters
    ----------

    y : ndarray, shape (n_samples,)
       The vector being estimated against
    """
    # boxcox with lmbda=None fits lambda itself and returns
    # (transformed data, fitted lambda); only the lambda is wanted here.
    _, fitted_lambda = boxcox(np.array(y), lmbda=None)
    return fitted_lambda
Exemplo n.º 29
0
    def fit(self, x, y=None):
        """Fit a Gaussianizing transformation to each variable/column in x.

        One coefficient per column is appended to ``self.coefs_``: the
        ``igmm`` result for 'lambert', ``None`` for 'brute' and the
        fitted lambda for 'boxcox'.  Any other strategy raises
        ``NotImplementedError``.  ``y`` is accepted but not used.
        Returns ``self``.
        """
        x = np.asarray(x)
        n_dims = len(x.shape)
        if n_dims == 1:
            # A flat list of samples is treated as a single column.
            x = x[:, np.newaxis]
        elif n_dims != 2:
            print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

        strategy = self.strategy
        if strategy == 'lambert':
            if self.verbose:
                print("Gaussianizing with Lambert method")
            for column in x.T:
                coef = igmm(column, tol=self.tol, max_iter=self.max_iter)
                self.coefs_.append(coef)
        elif strategy == 'brute':
            # TODO: In principle, we could store parameters to do a quasi-invert
            for _ in x.T:
                self.coefs_.append(None)
        elif strategy == 'boxcox':
            for column in x.T:
                fitted_lambda = boxcox(column)[1]
                self.coefs_.append(fitted_lambda)
        else:
            raise NotImplementedError
        return self
Exemplo n.º 30
0
def boxcox_xform(X, scaling=True):
    """
    Box-Cox transform of strictly positive data, with optional rescaling.

    :param X: data (numeric list, Pandas series or 1d np array)
    :param scaling: when True the data is divided by max(abs(X)) before the
                    transform, which keeps the values in (0, 1]
    :return: ``(y, x_max, lbda)`` -- the transformed array, the scale
             factor that was applied (1.0 without scaling) and the fitted
             lambda.
             If the data contains non-positive values a hint to use the
             Yeo-Johnson transform is printed and ``None`` is returned.
             If there is no data, ``(None, None, None, None)`` is returned.
             NOTE(review): the two failure paths return different shapes;
             both are kept as they were for existing callers.
    """
    x_arr = np.array(list(X))
    # Check for empty input before taking the maximum: np.max raises on an
    # empty array, which made the "no data" branch unreachable when
    # scaling was enabled.
    if x_arr.size == 0:
        print('boxcox_xform: no data')
        return None, None, None, None

    x_max = np.max(np.abs(x_arr)) if scaling is True else 1.0
    if np.min(x_arr) <= 0.0:    # Box-Cox is undefined for these values
        print('use YJ transform: yj_xform(X)')
        return None

    # only positive values
    z = x_arr / x_max         # scale to deal with overflow/underflow: values in (0, 1]
    y, lbda = sps.boxcox(z, lmbda=None, alpha=None)
    return y, x_max, lbda
Exemplo n.º 31
0
# Count the rows per artist and keep the raw artist names as a list.
popular_artist=df.groupby('artist_name').size()
print(popular_artist)
artist_list=df['artist_name'].values.tolist()

# NOTE(review): both expressions below discard their result, so they only
# display something in an interactive session; in particular ``df`` is NOT
# modified by fillna.  Left unchanged on purpose: filling with 0 would
# make the Box-Cox calls below fail, since they need positive data.
df.isnull().sum()
df.fillna(0)

# Use the fully qualified option name; the bare 'precision' key is
# ambiguous in recent pandas versions.
pd.set_option('display.precision', 3)
df.describe()

#Finding out the skew for each attribute
skew=df.skew()
print(skew)
# Removing the skew by using the boxcox transformations
# (stats.boxcox expects 1-D data, so select the column as a Series
# rather than as a one-column frame.)
transform=np.asarray(df['Liveness'].values)
df_transform = stats.boxcox(transform)[0]
# Plotting a histogram to show the difference 
plt.hist(df['Liveness'],bins=10) #original data
plt.show()
plt.hist(df_transform,bins=10) #corrected skew data
plt.show()

transform1=np.asarray(df['Popularity'].values)
df_transform1 = stats.boxcox(transform1)[0]
# Plot the distribution of the original Popularity values with a KDE overlay.
sns.distplot(df['Popularity'],bins=10,kde=True,kde_kws={"color": "k", "lw": 2, "label": "KDE"},color='yellow')
plt.show()
Exemplo n.º 32
0
    def fit(self,
            smoothing_level=None,
            smoothing_slope=None,
            smoothing_seasonal=None,
            damping_slope=None,
            optimized=True,
            use_boxcox=False,
            remove_bias=False,
            use_basinhopping=False,
            start_params=None,
            initial_level=None,
            initial_slope=None,
            use_brute=True):
        """
        Fit the model

        Parameters
        ----------
        smoothing_level : float, optional
            The alpha value of the simple exponential smoothing, if the value
            is set then this value will be used as the value.
        smoothing_slope :  float, optional
            The beta value of the Holt's trend method, if the value is
            set then this value will be used as the value.
        smoothing_seasonal : float, optional
            The gamma value of the holt winters seasonal method, if the value
            is set then this value will be used as the value.
        damping_slope : float, optional
            The phi value of the damped method, if the value is
            set then this value will be used as the value.
        optimized : bool, optional
            Estimate model parameters by maximizing the log-likelihood
        use_boxcox : {True, False, 'log', float}, optional
            Should the Box-Cox transform be applied to the data first? If 'log'
            then apply the log. If float then use lambda equal to float.
        remove_bias : bool, optional
            Remove bias from forecast values and fitted values by enforcing
            that the average residual is equal to zero.
        use_basinhopping : bool, optional
            Using Basin Hopping optimizer to find optimal values
        start_params: array, optional
            Starting values to used when optimizing the fit.  If not provided,
            starting values are determined using a combination of grid search
            and reasonable values based on the initial values of the data
        initial_level: float, optional
            Value to use when initializing the fitted level.
        initial_slope: float, optional
            Value to use when initializing the fitted slope.
        use_brute: bool, optional
            Search for good starting values using a brute force (grid)
            optimizer. If False, a naive set of starting values is used.

        Returns
        -------
        results : HoltWintersResults class
            See statsmodels.tsa.holtwinters.HoltWintersResults

        Notes
        -----
        This is a full implementation of the holt winters exponential smoothing
        as per [1]. This includes all the unstable methods as well as the
        stable methods. The implementation of the library covers the
        functionality of the R library as much as possible whilst still
        being Pythonic.

        References
        ----------
        [1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles
            and practice. OTexts, 2014.
        """
        # Variable renames to alpha,beta, etc as this helps with following the
        # mathematical notation in general
        alpha = smoothing_level
        beta = smoothing_slope
        gamma = smoothing_seasonal
        phi = damping_slope
        l0 = self._l0 = initial_level
        b0 = self._b0 = initial_slope

        data = self.endog
        damped = self.damped
        seasoning = self.seasoning
        trending = self.trending
        trend = self.trend
        seasonal = self.seasonal
        m = self.seasonal_periods
        opt = None
        phi = phi if damped else 1.0
        if use_boxcox == 'log':
            lamda = 0.0
            y = boxcox(data, lamda)
        elif isinstance(use_boxcox, float):
            lamda = use_boxcox
            y = boxcox(data, lamda)
        elif use_boxcox:
            y, lamda = boxcox(data)
        else:
            lamda = None
            y = data.squeeze()
        if np.ndim(y) != 1:
            raise ValueError('Only 1 dimensional data supported')
        self._y = y
        lvls = np.zeros(self.nobs)
        b = np.zeros(self.nobs)
        s = np.zeros(self.nobs + m - 1)
        p = np.zeros(6 + m)
        max_seen = np.finfo(np.double).max
        l0, b0, s0 = self.initial_values()

        xi = np.zeros_like(p, dtype=np.bool)
        if optimized:
            init_alpha = alpha if alpha is not None else 0.5 / max(m, 1)
            init_beta = beta if beta is not None else 0.1 * init_alpha if trending else beta
            init_gamma = None
            init_phi = phi if phi is not None else 0.99
            # Selection of functions to optimize for appropriate parameters
            if seasoning:
                init_gamma = gamma if gamma is not None else 0.05 * \
                                                             (1 - init_alpha)
                xi = np.array([
                    alpha is None, trending and beta is None, gamma is None,
                    initial_level is None, trending and initial_slope is None,
                    phi is None and damped
                ] + [True] * m)
                func = SMOOTHERS[(seasonal, trend)]
            elif trending:
                xi = np.array([
                    alpha is None, beta is None, False, initial_level is None,
                    initial_slope is None, phi is None and damped
                ] + [False] * m)
                func = SMOOTHERS[(None, trend)]
            else:
                xi = np.array([
                    alpha is None, False, False, initial_level is None, False,
                    False
                ] + [False] * m)
                func = SMOOTHERS[(None, None)]
            p[:] = [init_alpha, init_beta, init_gamma, l0, b0, init_phi] + s0
            if np.any(xi):
                # txi [alpha, beta, gamma, l0, b0, phi, s0,..,s_(m-1)]
                # Have a quick look in the region for a good starting place for alpha etc.
                # using guesstimates for the levels
                txi = xi & np.array([True, True, True, False, False, True] +
                                    [False] * m)
                txi = txi.astype(np.bool)
                bounds = np.array([(0.0, 1.0), (0.0, 1.0), (0.0, 1.0),
                                   (0.0, None), (0.0, None), (0.0, 1.0)] + [
                                       (None, None),
                                   ] * m)
                args = (txi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs,
                        max_seen)
                if start_params is None and np.any(txi) and use_brute:
                    res = brute(func,
                                bounds[txi],
                                args,
                                Ns=20,
                                full_output=True,
                                finish=None)
                    p[txi], max_seen, _, _ = res
                else:
                    if start_params is not None:
                        start_params = np.atleast_1d(np.squeeze(start_params))
                        if len(start_params) != xi.sum():
                            raise ValueError(
                                'start_params must have {0} values but '
                                'has {1} instead'.format(
                                    len(xi), len(start_params)))
                        p[xi] = start_params
                    args = (xi.astype(np.uint8), p, y, lvls, b, s, m,
                            self.nobs, max_seen)
                    max_seen = func(np.ascontiguousarray(p[xi]), *args)
                # alpha, beta, gamma, l0, b0, phi = p[:6]
                # s0 = p[6:]
                # bounds = np.array([(0.0,1.0),(0.0,1.0),(0.0,1.0),(0.0,None),
                # (0.0,None),(0.8,1.0)] + [(None,None),]*m)
                args = (xi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs,
                        max_seen)
                if use_basinhopping:
                    # Take a deeper look in the local minimum we are in to find the best
                    # solution to parameters, maybe hop around to try escape the local
                    # minimum we may be in.
                    res = basinhopping(func,
                                       p[xi],
                                       minimizer_kwargs={
                                           'args': args,
                                           'bounds': bounds[xi]
                                       },
                                       stepsize=0.01)
                    success = res.lowest_optimization_result.success
                else:
                    # Take a deeper look in the local minimum we are in to find the best
                    # solution to parameters
                    res = minimize(func, p[xi], args=args, bounds=bounds[xi])
                    success = res.success

                if not success:
                    from warnings import warn
                    from statsmodels.tools.sm_exceptions import ConvergenceWarning
                    warn("Optimization failed to converge. Check mle_retvals.",
                         ConvergenceWarning)
                p[xi] = res.x
                opt = res
            else:
                from warnings import warn
                from statsmodels.tools.sm_exceptions import EstimationWarning
                message = "Model has no free parameters to estimate. Set " \
                          "optimized=False to suppress this warning"
                warn(message, EstimationWarning)

            [alpha, beta, gamma, l0, b0, phi] = p[:6]
            s0 = p[6:]

        hwfit = self._predict(h=0,
                              smoothing_level=alpha,
                              smoothing_slope=beta,
                              smoothing_seasonal=gamma,
                              damping_slope=phi,
                              initial_level=l0,
                              initial_slope=b0,
                              initial_seasons=s0,
                              use_boxcox=use_boxcox,
                              remove_bias=remove_bias,
                              is_optimized=xi)
        hwfit._results.mle_retvals = opt
        return hwfit
Exemplo n.º 33
0
# Load the monthly wine sales series, indexed by the parsed 'month' column.
# The separator is passed by keyword: every read_csv argument after the path
# is keyword-only in pandas 2.x, so a positional ',' raises TypeError there.
wine = pd.read_csv(join(path_to_data_folder,
                        'monthly-australian-wine-sales.csv'),
                   sep=',',
                   index_col=['month'],
                   parse_dates=['month'],
                   dayfirst=True)
wine.sales = wine.sales * 1000

# `diki` is defined elsewhere in the file; the message printed further down
# labels its result as a Dickey-Fuller p-value.
wine.sales.plot(title="Diki: " + str(diki(wine.sales)))
print('Diki wine: ', diki(wine.sales))

# sm.tsa.seasonal_decompose(wine.sales).plot()
# plt.show()

# Box-Cox transform with the lambda fitted by scipy (no lmbda argument given).
wine['sales_box'], lmbda = stats.boxcox(wine.sales)
wine.sales_box.plot(title="Diki: " + str(diki(wine.sales_box)))
# plt.show()

plt.ylabel(u'Transformed wine sales')
print("Оптимальный параметр преобразования Бокса-Кокса: %f" % lmbda)
print("Критерий Дики-Фуллера: p=%f" % diki(wine.sales_box))

# Seasonal difference at lag 12; the first 12 entries are NaN by construction.
wine['sales_box_diff12'] = wine.sales_box - wine.sales_box.shift(12)
# NOTE(review): this in-place dropna acts on the Series returned by attribute
# access, not on the DataFrame column itself -- confirm the NaNs are really
# gone for the calls below on the pandas version in use.
wine.sales_box_diff12.dropna(inplace=True)
wine.sales_box_diff12.plot(title="sales_box_diff12, diki: " +
                           str(diki(wine.sales_box_diff12)))
sm.tsa.seasonal_decompose(wine.sales_box_diff12).plot()
# plt.show()

wine['sales_box_diff1'] = wine.sales_box_diff12 - wine.sales_box_diff12.shift(
Exemplo n.º 34
0
# Remove the 'Late (31-120 days)' and 'Issued' rows, then relabel the
# 'Charged Off' and 'In Grace Period' statuses as 'Default'.
data_1 = data_1[~data_1.loan_status.isin(['Late (31-120 days)', 'Issued'])]
data_1['loan_status'] = data_1['loan_status'].replace({
    'Charged Off': 'Default',
    'In Grace Period': 'Default',
})

# Encode both columns as integer category codes.
data_1['loan_status'] = data_1['loan_status'].astype('category').cat.codes
data_1['delinq_2yrs'] = data_1['delinq_2yrs'].astype('category').cat.codes



#print(data_1['loan_status'].unique())
#print(data_1['loan_status'].value_counts())

# Box-Cox every strictly positive float64 column, but keep the transformed
# values only when the fitted lambda differs from 1 by more than 0.02.
numerical = data_1.columns[data_1.dtypes == 'float64']
for i in numerical:
    if not data_1[i].min() > 0:
        continue
    _present = data_1[i].notnull()
    transformed, lamb = boxcox(data_1.loc[_present, i])
    if np.abs(1 - lamb) > 0.02:
        data_1.loc[_present, i] = transformed

### Split the data in two for training and testing


data_1 = pd.get_dummies(data_1, drop_first=True)

#ros = RandomOverSampler(random_state=0)

traindata, testdata = train_test_split(data_1,
                                       test_size=.2,
                                       stratify=data_1['loan_status'])
for _split in (testdata, traindata):
    _split.reset_index(drop=True, inplace=True)

sc = StandardScaler()
Exemplo n.º 35
0
      v in values_for_variable if v is not None])
    n_outliers_removed = 0

    #if remove_large_outliers:
    #  percentile_99 = np.percentile(nn_values_for_variable, 97)
    #  outlier_indices = nn_values_for_variable > percentile_99
    #  n_outliers_removed = np.count_nonzero(outlier_indices)
    #  nn_values_for_variable = nn_values_for_variable[outlier_indices == False]

    nn_values_for_variable = np.asarray(nn_values_for_variable)

    alpha = np.min(nn_values_for_variable)
    nn_values_for_variable -= alpha

    lmbda = boxcox_normmax(nn_values_for_variable + BOXCOX_A, method='mle')
    nn_values_for_variable = boxcox(nn_values_for_variable + BOXCOX_A,
                                    lmbda=lmbda)

    ind = (all_values_for_variable != None)

    all_values_for_variable[ind] = nn_values_for_variable
    #beta = np.std(nn_values_for_variable)
    #nn_values_for_variable /= beta
    #delta = np.mean(nn_values_for_variable)
    #nn_values_for_variable -= delta
    all_values_for_variables.append(all_values_for_variable)
    positive_values_for_variables.append(nn_values_for_variable)
    scaling_parameters.append([lmbda, alpha, 1, 0])

    n_unique_values = len(np.unique(nn_values_for_variable))
    n_not_missing = len(nn_values_for_variable)
    n_total_samples = len(values_for_variable)
Exemplo n.º 36
0
# plot_line(df, "date", "daily_sign_ups")
"""
box-Cox transforms are data transformations that evaluate a set of lambda coefficients (λ) 
and selects the value that achieves the best approximation of normality

the boxcox method returns a positive dataset transformed by a Box-Cox power transformation
the boxcox method has one required input: a 1-dimensional array of positive data to transform

you can also specify the λ value you’d like to use for your transformation (e.g. λ = 0 for a log transform)
otherwise, the boxcox method will find the λ that maximizes the log-likelihood function 
and will return it as the second output argument
"""

# Box-Cox transform the daily sign-up counts; the transformed values go into
# column "y" and the fitted lambda is kept in `lam`.
df["y"], lam = boxcox(df["daily_sign_ups"])

# Raw series (blue) on the first row and the transformed series (green) on
# the second row of a three-row grid, sharing the x axis.
fig = plt.figure(figsize=(6, 4), dpi=300)
ax1 = plt.subplot(3, 1, 1)
plt.plot(df["date"], df["daily_sign_ups"], "b")
plt.setp(ax1.get_xticklabels(), fontsize=6)
ax2 = plt.subplot(3, 1, 2, sharex=ax1)  # share x only
plt.plot(df["date"], df["y"], "g")
# hide the tick labels on the second row
plt.setp(ax2.get_xticklabels(), visible=False)
plt.show()

# create the Prophet model instance
m = Prophet()
                      squeeze=True)
# Walk-forward validation: forecast one step, invert the Box-Cox transform,
# then append the true observation to the history and refit.
y = validation.values.astype('float32')
# load the saved model and the lambda stored alongside it
model_fit = ARIMAResults.load('model.pkl')
lam = numpy.load('model_lambda.npy')
# make first prediction from the saved model
predictions = list()
yhat = model_fit.forecast()[0]
yhat = boxcox_inverse(yhat, lam)
predictions.append(yhat)
history.append(y[0])
# '%.3f' for both values: the previous '%3.f' meant width 3 with zero
# decimals, so the expected value was printed rounded to an integer.
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))
# rolling forecasts
for i in range(1, len(y)):
    # transform: refit lambda on the full history each step
    transformed, lam = boxcox(history)
    if lam < -5:
        # fall back to the untransformed history, with lam=1, when the
        # fitted lambda is below -5
        transformed, lam = history, 1
    # predict
    model = ARIMA(transformed, order=(0, 1, 2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    # invert transformed prediction
    yhat = boxcox_inverse(yhat, lam)
    predictions.append(yhat)
    # observation
    obs = y[i]
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))
# report performance
rmse = sqrt(mean_squared_error(y, predictions))
Exemplo n.º 38
0
# Box-Cox transform
import pandas
from scipy.stats import boxcox

# Location of the data file and the nine column names assigned to it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = pandas.read_csv(url, names=names)

# First eight columns go to X, the ninth column to Y.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Shift column 2 by one before the transform; keep only the transformed
# values (element 0 of the result), not the fitted lambda.
X_boxcox = boxcox(X[:, 2] + 1)[0]
print(X_boxcox)
Exemplo n.º 39
0
    train_test[i]=np.log10(train_test[i]+10)

'''
##

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numeric_feats=train_test.select_dtypes(include=numerics).columns

skewed_feats = train_test[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.2]
skewed_feats = skewed_feats.index

for feats in skewed_feats:
    train_test[feats] = train_test[feats] + 1
    train_test[feats], lam = boxcox(train_test[feats])

##

train_test=train_test.drop(['pymnt_plan','verification_status_joint','application_type','title','batch_enrolled'],axis=1)

print('Label Encoding')

cat=['object']
train_test.select_dtypes(include=cat)
cat_col=train_test.select_dtypes(include=cat).columns

''' 
rm=['emp_length','zip_code','last_week_pay']
cat_col=[c for c in cat_col if c not in rm]
cat_col
Exemplo n.º 40
0
              PS. λ是待定变换参数,一般为λ=0,1/2,-1 ,scipy.stats默认λ是None
'''


# Box-Cox transform every sufficiently skewed numeric column, fitting lambda
# on the train and test values stacked together, with a tqdm progress bar.
with tqdm(total=train_num_encode.shape[1],desc='Transforming',unit='cols') as pbar:
    # Iterate over every numeric feature
    for col in range(train_num_encode.shape[1]):
        # Gather all values of this feature (train and test sets combined)
        values = np.hstack((train_num[:,col],test_num[:,col]))
        
        # Skewness of all values of this feature
        sk = skew(values)
        
        if sk > 0.25:
            # Apply Box-Cox to values shifted by +1, then split the result
            # back into its train and test parts
            values_enc,lam = boxcox(values+1) 
            train_num_encode[:,col] = values_enc[:train_num.shape[0]]
            test_num_encode[:,col] = values_enc[train_num.shape[0]:]
        else:
            # Leave the column unchanged
            train_num_encode[:,col] = train_num[:,col]
            test_num_encode[:,col] = test_num[:,col]
            
        pbar.update(1)
        

print('saving...')

# Save the features
# NOTE(review): stores the 'numeric' part feature under the name
# 'numeric_boxcox' -- presumably the feature names are unchanged; confirm.
Dataset.save_part_feature('numeric_boxcox',Dataset.get_part_feature('numeric'))
# Save the data
Exemplo n.º 41
0
def plot_df_transformed(df: pd.DataFrame,
                        filters: tuple,
                        zero_values: str = "ignore") -> None:
    """
    Draw one row of three panels per selected column: a distribution plot of
    the original values, a distribution plot of the Box-Cox transformed
    values, and a probability plot of the transformed values.

    Columns are selected with ``filtered_col_list(df, strings=filters)``. If
    anything fails while drawing a column, the column name and the exception
    are printed and the loop moves on to the next column.

    :param df: supplied dataframe
    :param filters: tuple containing filter strings for columns
    :param zero_values: "ignore" keeps only rows whose absolute value is
        greater than zero; "increment" adds 0.0001 to every value instead
    :return: None; the figure is displayed with ``plt.show()``
    """

    def comma_formatter():
        # A separate formatter instance is created for every axis.
        return FuncFormatter(lambda x, p: format(int(x), ','))

    to_plot = filtered_col_list(df, strings=filters)
    n_rows = len(to_plot)

    plt.figure(figsize=(12, ceil(n_rows * 10 / 3)))

    for row, var in enumerate(to_plot):
        try:
            if zero_values == "ignore":
                nonzero_rows = np.abs(df[var]) > 0
                series = pd.Series(df[nonzero_rows][var])
            elif zero_values == "increment":
                series = pd.Series(df[var] + 0.0001)

            # Left panel: the original values.
            original_ax = plt.subplot2grid((n_rows, 3), (row, 0),
                                           rowspan=1,
                                           colspan=1)
            original_ax.get_xaxis().set_major_formatter(comma_formatter())
            plt.yticks([], [])
            sns.distplot(series, fit=norm)
            plt.title("Distribution: Original Data")
            plt.ylabel("Density")

            # Box-Cox is applied to the absolute values; only the transformed
            # data (element 0 of the result) is kept.
            raw_values = np.asarray(series.values)
            series_transformed = boxcox(np.abs(raw_values))[0]

            # Middle panel: the transformed values.
            transformed_ax = plt.subplot2grid((n_rows, 3), (row, 1),
                                              rowspan=1,
                                              colspan=1)
            transformed_ax.get_xaxis().set_major_formatter(comma_formatter())
            sns.distplot(series_transformed, fit=norm)
            plt.xlabel(var)
            plt.title("Distribution: Boxcox Data")
            plt.yticks([], [])
            plt.ylabel("Density")

            # Right panel: probability plot of the transformed values.
            probplot_ax = plt.subplot2grid((n_rows, 3), (row, 2),
                                           rowspan=1,
                                           colspan=1)
            probplot_ax.get_yaxis().set_major_formatter(comma_formatter())
            stats.probplot(series_transformed, plot=probplot_ax)
            plt.title("Probability Plot: Boxcox Data")
            plt.xlabel(var)
            plt.yticks([], [])

        except Exception as e:
            print(var, e)

    plt.subplots_adjust(hspace=0.2)
    plt.tight_layout()

    plt.show()
Exemplo n.º 42
0
def boxcoxtransform(*column_names, add=0):
    """Box-Cox transform the named columns of every frame in ``combine_df``.

    For each column, lambda is fitted once on ``super_df[column] + add`` and
    printed; that same lambda is then used to transform ``column + add`` in
    place in each DataFrame of ``combine_df``.

    Parameters
    ----------
    *column_names : str
        Names of the columns to transform.
    add : number, optional
        Constant added to the values before fitting and transforming.
    """
    for column in column_names:
        # Only the fitted lambda (element 1 of the result) is needed here.
        fitted_lambda = stats.boxcox(super_df[column] + add)[1]
        print('BoxCox Transform ', column, ' with lambda: ', fitted_lambda)
        for frame in combine_df:
            shifted = frame[column] + add
            frame[column] = stats.boxcox(shifted, fitted_lambda)
Exemplo n.º 43
0
    def feature_engineering(self):
        """
        Build the model-ready feature table from the train and test frames.

        Reads ``self.df_csv_train``, ``self.df_csv_test`` and
        ``self.id_train``; stores the encoded feature table in
        ``self.df_model`` and drops the detected outliers from both
        ``self.df_model`` and ``self.id_train``. Nothing is returned.

        The work is done in the numbered steps marked in the body: NA
        filling (1-3), basement and floor-area features (4-5), ordinal
        encoding (6), zero handling (7), log transform of SalePrice (8),
        feature typing (9), correlation report (10), min-max scaling (11),
        Box-Cox transform (12), dummy encoding (13) and outlier removal (14).
        Progress and diagnostics are written with ``print``.
        """
        # combine train and test datas in to one dataframe
        # NOTE(review): the index is not reset here, and later steps use
        # df_all.loc[row, ...] on single labels -- this assumes the train and
        # test index labels do not overlap; confirm against the loader.
        df_all = pd.concat([self.df_csv_train, self.df_csv_test])
        print('train.shape=', self.df_csv_train.shape, ', test.shape=',
              self.df_csv_test.shape)

        cols_with_na = ProcessData.get_cols_with_na(
            df_all.drop('SalePrice', axis=1))
        print(cols_with_na.sort_values(ascending=False).to_string())

        # 1.Meaningful NA Values #########
        # columns where NA values have meaning, e.g. no pool, no basement, etc.
        cols_fillna = [
            'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType',
            'FireplaceQu', 'GarageQual', 'GarageCond', 'GarageFinish',
            'GarageType', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
            'BsmtFinType1', 'BsmtFinType2'
        ]

        # replace 'NA' with 'None' in these columns
        for col in cols_fillna:
            df_all[col].fillna('None', inplace=True)

        # GarageYrBlt nans: no garage. Fill with property YearBuilt.
        # (more appropriate than 0, which would be ~2000 away from all other values)
        df_all.loc[df_all.GarageYrBlt.isnull(),
                   'GarageYrBlt'] = df_all.loc[df_all.GarageYrBlt.isnull(),
                                               'YearBuilt']

        # No masonry veneer - fill area with 0
        df_all.MasVnrArea.fillna(0, inplace=True)

        # No basement - fill areas/counts with 0
        df_all.BsmtFullBath.fillna(0, inplace=True)
        df_all.BsmtHalfBath.fillna(0, inplace=True)
        df_all.BsmtFinSF1.fillna(0, inplace=True)
        df_all.BsmtFinSF2.fillna(0, inplace=True)
        df_all.BsmtUnfSF.fillna(0, inplace=True)
        df_all.TotalBsmtSF.fillna(0, inplace=True)

        # No garage - fill areas/counts with 0
        df_all.GarageArea.fillna(0, inplace=True)
        df_all.GarageCars.fillna(0, inplace=True)

        # 2.LotFrontage NA Values #########
        # LotFrontage
        # fill NA values using a linear regressor

        # convert categoricals to dummies, exclude SalePrice from model
        df_frontage = pd.get_dummies(df_all.drop('SalePrice', axis=1))

        # normalise columns to 0-1
        for col in df_frontage.drop('LotFrontage', axis=1).columns:
            df_frontage[col] = ProcessData.scale_minmax(df_frontage[col])

        # rows without any NA are used to train the LotFrontage regressor
        lf_train = df_frontage.dropna()
        lf_train_y = lf_train.LotFrontage
        lf_train_X = lf_train.drop('LotFrontage', axis=1)

        # fit model
        lr = Ridge()
        lr.fit(lf_train_X, lf_train_y)

        # check model results
        lr_coefs = pd.Series(lr.coef_, index=lf_train_X.columns)

        print('----------------')
        print('Intercept:', lr.intercept_)
        print('----------------coefficient: head(10)')
        print(lr_coefs.sort_values(ascending=False).head(10))
        print('----------------coefficient: tail(10)')
        print(lr_coefs.sort_values(ascending=False).tail(10))
        print('----------------')
        print('R2:', lr.score(lf_train_X, lf_train_y))
        print('----------------')

        # fill na values using model predictions
        na_frontage = df_all.LotFrontage.isnull()
        X = df_frontage[na_frontage].drop('LotFrontage', axis=1)
        y = lr.predict(X)

        # fill na values
        df_all.loc[na_frontage, 'LotFrontage'] = y

        # 3.Remaining NaNs #########
        # NOTE(review): cols_with_na was computed before any of the filling
        # above and is not recomputed, so this presumably prints the same
        # counts as the first report -- confirm this is intended.
        print(cols_with_na.sort_values(ascending=False).to_string())

        rows_with_na = df_all.drop('SalePrice', axis=1).isnull().sum(axis=1)
        rows_with_na = rows_with_na[rows_with_na > 0]
        print(rows_with_na.sort_values(ascending=False).to_string())

        # fill remaining NA with mode in that column
        for col in cols_with_na.index:
            df_all[col].fillna(df_all[col].mode()[0], inplace=True)

        # Now no more NaN values
        df_all.info()

        # 4.Basement Finish Types #########
        # create separate columns for area of each possible
        # basement finish type
        bsmt_fin_cols = ['BsmtGLQ', 'BsmtALQ', 'BsmtBLQ', 'BsmtRec', 'BsmtLwQ']

        for col in bsmt_fin_cols:
            # initialise as columns of zeros
            df_all[col + 'SF'] = 0

        # fill remaining finish type columns
        for row in df_all.index:
            fin1 = df_all.loc[row, 'BsmtFinType1']
            if (fin1 != 'None') and (fin1 != 'Unf'):
                # add area (SF) to appropriate column
                df_all.loc[row,
                           'Bsmt' + fin1 + 'SF'] += df_all.loc[row,
                                                               'BsmtFinSF1']

            fin2 = df_all.loc[row, 'BsmtFinType2']
            if (fin2 != 'None') and (fin2 != 'Unf'):
                df_all.loc[row,
                           'Bsmt' + fin2 + 'SF'] += df_all.loc[row,
                                                               'BsmtFinSF2']

        # remove initial BsmtFin columns
        df_all.drop(
            ['BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2'],
            axis=1,
            inplace=True)

        # already have BsmtUnf column in dataset
        bsmt_fin_cols.append('BsmtUnf')

        # also create features representing the fraction of the basement that is each finish type
        for col in bsmt_fin_cols:
            df_all[col + 'Frac'] = df_all[col + 'SF'] / df_all['TotalBsmtSF']
            # replace any NA with zero (for properties without a basement)
            df_all[col + 'Frac'].fillna(0, inplace=True)

        # 5.1st and 2nd Floor Area #########
        df_all['LowQualFinFrac'] = df_all['LowQualFinSF'] / df_all['GrLivArea']
        df_all['1stFlrFrac'] = df_all['1stFlrSF'] / df_all['GrLivArea']
        df_all['2ndFlrFrac'] = df_all['2ndFlrSF'] / df_all['GrLivArea']
        df_all['TotalAreaSF'] = df_all['GrLivArea'] + df_all['TotalBsmtSF'] + df_all['GarageArea'] + df_all[
            'EnclosedPorch'] + \
                                df_all['ScreenPorch']
        df_all['LivingAreaSF'] = df_all['1stFlrSF'] + df_all['2ndFlrSF'] + df_all['BsmtGLQSF'] + df_all['BsmtALQSF'] + \
                                 df_all[
                                     'BsmtBLQSF']
        df_all['StorageAreaSF'] = df_all['LowQualFinSF'] + df_all['BsmtRecSF'] + df_all['BsmtLwQSF'] + df_all[
            'BsmtUnfSF'] + \
                                  df_all['GarageArea']

        # 6.Categorical Features with Meaningful Ordering #########
        # convert some categorical values to numeric scales

        # Excellent, Good, Typical, Fair, Poor, None: Convert to 0-5 scale
        cols_ExGd = [
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ]

        dict_ExGd = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}

        for col in cols_ExGd:
            df_all[col].replace(dict_ExGd, inplace=True)

        print(df_all[cols_ExGd].head(5))

        # Remaining columns
        df_all['BsmtExposure'].replace(
            {
                'Gd': 4,
                'Av': 3,
                'Mn': 2,
                'No': 1,
                'None': 0
            }, inplace=True)
        df_all['CentralAir'].replace({'Y': 1, 'N': 0}, inplace=True)
        df_all['Functional'].replace(
            {
                'Typ': 7,
                'Min1': 6,
                'Min2': 5,
                'Mod': 4,
                'Maj1': 3,
                'Maj2': 2,
                'Sev': 1,
                'Sal': 0
            },
            inplace=True)
        df_all['GarageFinish'].replace(
            {
                'Fin': 3,
                'RFn': 2,
                'Unf': 1,
                'None': 0
            }, inplace=True)
        df_all['LotShape'].replace({
            'Reg': 3,
            'IR1': 2,
            'IR2': 1,
            'IR3': 0
        },
                                   inplace=True)
        df_all['Utilities'].replace(
            {
                'AllPub': 3,
                'NoSewr': 2,
                'NoSeWa': 1,
                'ELO': 0
            }, inplace=True)
        df_all['LandSlope'].replace({
            'Gtl': 2,
            'Mod': 1,
            'Sev': 0
        },
                                    inplace=True)

        # 7.Dealing with Zeros #########
        # fraction of zeros in each column
        frac_zeros = ((df_all == 0).sum() / len(df_all))

        # no. unique values in each column
        n_unique = df_all.nunique()

        # difference between frac. zeros and expected
        # frac. zeros if values evenly distributed between
        # classes
        xs_zeros = frac_zeros - 1 / n_unique

        # create dataframe and display which columns may be problematic
        zero_cols = pd.DataFrame({
            'frac_zeros': frac_zeros,
            'n_unique': n_unique,
            'xs_zeros': xs_zeros
        })
        zero_cols = zero_cols[zero_cols.frac_zeros > 0]
        zero_cols.sort_values(by='xs_zeros', ascending=False, inplace=True)
        print(zero_cols[(zero_cols.xs_zeros > 0)])

        # very few properties with Pool or 3SsnPorch
        # replace columns with binary indicator
        df_all['HasPool'] = (df_all['PoolQC'] > 0).astype(int)
        df_all['Has3SsnPorch'] = (df_all['3SsnPorch'] > 0).astype(int)
        df_all.drop(['PoolQC', 'PoolArea', '3SsnPorch'], axis=1, inplace=True)

        # 'half' bathrooms - add half value to 'full' bathrooms
        df_all['BsmtFullBath'] = df_all[
            'BsmtFullBath'] + 0.5 * df_all['BsmtHalfBath']
        df_all['FullBath'] = df_all['FullBath'] + 0.5 * df_all['HalfBath']
        df_all.drop(['BsmtHalfBath', 'HalfBath'], axis=1, inplace=True)

        # create additional dummy variable for
        # continuous variables with a lot of zeros
        dummy_cols = [
            'LowQualFinSF', '2ndFlrSF', 'MiscVal', 'ScreenPorch', 'WoodDeckSF',
            'OpenPorchSF', 'EnclosedPorch', 'MasVnrArea', 'GarageArea',
            'Fireplaces', 'BsmtGLQSF', 'BsmtALQSF', 'BsmtBLQSF', 'BsmtRecSF',
            'BsmtLwQSF', 'BsmtUnfSF', 'TotalBsmtSF'
        ]

        for col in dummy_cols:
            df_all['Has' + col] = (df_all[col] > 0).astype(int)

        # 8.Log Transform SalePrice #########
        # Log Transform SalePrice to improve normality
        sp = df_all.SalePrice
        df_all.SalePrice = np.log(sp)

        print(df_all.SalePrice.describe())

        # 9.Identify Types of Features #########
        # extract names of numeric columns
        dtypes = df_all.dtypes
        cols_numeric = dtypes[dtypes != object].index.tolist()

        # MSubClass should be treated as categorical
        cols_numeric.remove('MSSubClass')

        # choose any numeric column with less than 13 values to be
        # "discrete". 13 chosen to include months of the year.
        # other columns "continuous"
        col_nunique = dict()

        for col in cols_numeric:
            col_nunique[col] = df_all[col].nunique()

        col_nunique = pd.Series(col_nunique)

        cols_discrete = col_nunique[col_nunique < 13].index.tolist()
        cols_continuous = col_nunique[col_nunique >= 13].index.tolist()

        print(len(cols_numeric), 'numeric columns, of which',
              len(cols_continuous), 'are continuous and', len(cols_discrete),
              'are discrete.')

        # extract names of categorical columns
        cols_categ = dtypes[~dtypes.index.isin(cols_numeric)].index.tolist()

        for col in cols_categ:
            df_all[col] = df_all[col].astype('category')

        print(len(cols_categ), 'categorical columns.')

        # 10.Correlation Between Numeric Features #########
        # correlation between numeric variables
        df_corr = df_all.loc[self.id_train,
                             cols_numeric].corr(method='spearman').abs()

        # order columns and rows by correlation with SalePrice
        df_corr = df_corr.sort_values(
            'SalePrice', axis=0, ascending=False).sort_values('SalePrice',
                                                              axis=1,
                                                              ascending=False)

        print(df_corr.SalePrice.head(20))
        print('-----------------')
        print(df_corr.SalePrice.tail(10))

        # 11.Normalise Numeric Features #########
        # normalise numeric columns
        scale_cols = [col for col in cols_numeric if col != 'SalePrice']
        df_all[scale_cols] = df_all[scale_cols].apply(ProcessData.scale_minmax,
                                                      axis=0)
        # the describe() result is not used or printed
        df_all[scale_cols].describe()

        # 12.Box-Cox Transform Suitable Variables #########
        # variables not suitable for box-cox transformation (usually due to excessive zeros)
        cols_notransform = [
            '2ndFlrSF', '1stFlrFrac', '2ndFlrFrac', 'StorageAreaSF',
            'EnclosedPorch', 'LowQualFinSF', 'MasVnrArea', 'MiscVal',
            'ScreenPorch', 'OpenPorchSF', 'WoodDeckSF', 'SalePrice',
            'BsmtGLQSF', 'BsmtALQSF', 'BsmtBLQSF', 'BsmtRecSF', 'BsmtLwQSF',
            'BsmtUnfSF', 'BsmtGLQFrac', 'BsmtALQFrac', 'BsmtBLQFrac',
            'BsmtRecFrac', 'BsmtLwQFrac', 'BsmtUnfFrac'
        ]

        cols_transform = [
            col for col in cols_continuous if col not in cols_notransform
        ]

        # transform remaining variables
        print('Transforming', len(cols_transform), 'columns:', cols_transform)

        for col in cols_transform:
            # transform column
            # the +1 shift presumably keeps the min-max scaled values strictly
            # positive, as Box-Cox requires -- TODO confirm scale_minmax range
            df_all.loc[:, col], _ = stats.boxcox(df_all.loc[:, col] + 1)

            # renormalise column
            df_all.loc[:, col] = ProcessData.scale_minmax(df_all.loc[:, col])

        # 13.Prepare Data for Model Fitting #########
        # select features, encode categoricals, create dataframe for model fitting

        # select which features to use (all for now)
        model_cols = df_all.columns

        # encode categoricals
        self.df_model = pd.get_dummies(df_all[model_cols])

        # Rather than including Condition1 and Condition2, or Exterior1st and Exterior2nd,
        # combine the dummy variables (allowing 2 true values per property)
        if ('Condition1' in model_cols) and ('Condition2' in model_cols):

            cond_suffix = [
                'Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn',
                'RRNn'
            ]

            for suffix in cond_suffix:
                col_cond1 = 'Condition1_' + suffix
                col_cond2 = 'Condition2_' + suffix

                self.df_model[col_cond1] = self.df_model[
                    col_cond1] | self.df_model[col_cond2]
                self.df_model.drop(col_cond2, axis=1, inplace=True)

        if ('Exterior1st' in model_cols) and ('Exterior2nd' in model_cols):

            # some different strings in Exterior1st and Exterior2nd for same type - rename columns to correct
            self.df_model.rename(columns={
                'Exterior2nd_Wd Shng': 'Exterior2nd_WdShing',
                'Exterior2nd_Brk Cmn': 'Exterior2nd_BrkComm',
                'Exterior2nd_CmentBd': 'Exterior2nd_CemntBd'
            },
                                 inplace=True)

            ext_suffix = [
                'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
                'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco',
                'VinylSd', 'Wd Sdng', 'WdShing', 'AsbShng'
            ]

            for suffix in ext_suffix:
                col_cond1 = 'Exterior1st_' + suffix
                col_cond2 = 'Exterior2nd_' + suffix

                self.df_model[col_cond1] = self.df_model[
                    col_cond1] | self.df_model[col_cond2]
                self.df_model.drop(col_cond2, axis=1, inplace=True)

        print(self.df_model.head())

        # 14.Identify and Remove Outliers #########
        # get training data
        self.split_data_to_train_validation()

        # find and remove outliers using a Ridge model
        # NOTE(review): model_cols is every df_all column, so the frame passed
        # as the first data argument still contains SalePrice unless
        # find_outliers drops it -- confirm.
        outliers = self.find_outliers(
            Ridge(), self.df_model.loc[self.id_train],
            self.df_model.loc[self.id_train].SalePrice)

        # permanently remove these outliers from the data
        self.df_model = self.df_model.drop(outliers)
        self.id_train = self.id_train.drop(outliers)
Exemplo n.º 44
0
# Keep only the number of photos per listing; the raw photo list and the two
# free-text address columns are discarded.
train_test['photos_count'] = train_test['photos'].apply(len)
train_test.drop(columns=['photos', 'display_address', 'street_address'],
                inplace=True)

# Every column that is still stored with object dtype gets label-encoded.
categoricals = [
    col for col in train_test.columns if train_test[col].dtype == 'object'
]

for feat in categoricals:
    lbl = preprocessing.LabelEncoder()
    train_test[feat] = lbl.fit_transform(list(train_test[feat].values))

# Replace the raw price by its Box-Cox transform. The second value returned by
# boxcox is kept in `tmp` (presumably the fitted lambda -- verify against the
# boxcox that this file imports).
bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price
train_test.drop(columns='price', inplace=True)

# Bathroom counts become integer category codes (via their string form) ...
train_test['bathrooms_cat'] = train_test['bathrooms'].apply(str)
train_test['bathrooms_cat'], labels = pd.factorize(
    train_test['bathrooms_cat'].values, sort=True)
train_test.drop(columns='bathrooms', inplace=True)

# ... and bedroom counts are factorized directly from their values.
train_test['bedroom_cat'], labels = pd.factorize(
    train_test['bedrooms'].values, sort=True)
train_test.drop(columns='bedrooms', inplace=True)

# Final list of column names after all transformations above.
features = list(train_test.columns)
Exemplo n.º 45
0
    def _predict(self, h=None, smoothing_level=None, smoothing_slope=None,
                 smoothing_seasonal=None, initial_level=None, initial_slope=None,
                 damping_slope=None, initial_seasons=None, use_boxcox=None, lamda=None,
                 remove_bias=None, is_optimized=None):
        """
        Helper prediction function

        Runs the smoothing recursions over ``self.endog``, extends them ``h``
        steps past the sample, computes fit statistics and stores the
        parameters used in ``self.params``.

        Parameters
        ----------
        h : int, optional
            The number of time steps to forecast ahead. Although the default
            is None, the value is used in integer arithmetic, so an int must
            be supplied.
        smoothing_level : float, optional
            The alpha value of the recursions.
        smoothing_slope : float, optional
            The beta value; only used when ``self.trending`` is true.
        smoothing_seasonal : float, optional
            The gamma value; only used when ``self.seasoning`` is true.
        initial_level : float, optional
            Value stored in the first level slot.
        initial_slope : float, optional
            Value stored in the first slope slot.
        damping_slope : float, optional
            The phi value; replaced by 1.0 when ``self.damped`` is false.
        initial_seasons : array_like, optional
            Values stored in the first ``self.seasonal_periods`` seasonal
            slots.
        use_boxcox : {'log', float, bool}, optional
            'log' applies boxcox with lambda 0.0, a float applies boxcox with
            that lambda, any other truthy value lets boxcox choose lambda, and
            a falsy value leaves the data untransformed.
        lamda : float, optional
            Ignored on input: every branch on ``use_boxcox`` overwrites it.
        remove_bias : bool, optional
            When truthy, the mean residual is added to the fitted values.
        is_optimized : ndarray, optional
            Flags cast to bool and reported per parameter; all False when
            None.

        Returns
        -------
        HoltWintersResultsWrapper
            Wrapper around the ``HoltWintersResults`` built from this run.
        """
        # Variable renames to alpha, beta, etc as this helps with following the
        # mathematical notation in general
        alpha = smoothing_level
        beta = smoothing_slope
        gamma = smoothing_seasonal
        phi = damping_slope

        # Start in sample and out of sample predictions
        data = self.endog
        damped = self.damped
        seasoning = self.seasoning
        trending = self.trending
        trend = self.trend
        seasonal = self.seasonal
        m = self.seasonal_periods
        phi = phi if damped else 1.0
        if use_boxcox == 'log':
            lamda = 0.0
            y = boxcox(data, 0.0)
        elif isinstance(use_boxcox, float):
            lamda = use_boxcox
            y = boxcox(data, lamda)
        elif use_boxcox:
            y, lamda = boxcox(data)
        else:
            lamda = None
            y = data.squeeze()
            if np.ndim(y) != 1:
                raise NotImplementedError('Only 1 dimensional data supported')
        y_alpha = np.zeros((self.nobs,))
        y_gamma = np.zeros((self.nobs,))
        alphac = 1 - alpha
        y_alpha[:] = alpha * y
        if trending:
            betac = 1 - beta
        if seasoning:
            gammac = 1 - gamma
            y_gamma[:] = gamma * y
        # Level and slope hold one slot per observation plus h + 1 extra;
        # the seasonal array carries m additional leading slots.
        lvls = np.zeros((self.nobs + h + 1,))
        b = np.zeros((self.nobs + h + 1,))
        s = np.zeros((self.nobs + h + m + 1,))
        lvls[0] = initial_level
        b[0] = initial_slope
        s[:m] = initial_seasons
        phi_h = np.cumsum(np.repeat(phi, h + 1)**np.arange(1, h + 1 + 1)
                          ) if damped else np.arange(1, h + 1 + 1)
        # Dispatch tables keyed by trend type so the recursions below can be
        # written once for 'mul', 'add' and no trend.
        trended = {'mul': np.multiply,
                   'add': np.add,
                   None: lambda l, b: l
                   }[trend]
        detrend = {'mul': np.divide,
                   'add': np.subtract,
                   None: lambda l, b: 0
                   }[trend]
        dampen = {'mul': np.power,
                  'add': np.multiply,
                  None: lambda b, phi: 0
                  }[trend]
        nobs = self.nobs
        if seasonal == 'mul':
            for i in range(1, nobs + 1):
                lvls[i] = y_alpha[i - 1] / s[i - 1] + \
                       (alphac * trended(lvls[i - 1], dampen(b[i - 1], phi)))
                if trending:
                    b[i] = (beta * detrend(lvls[i], lvls[i - 1])) + \
                           (betac * dampen(b[i - 1], phi))
                s[i + m - 1] = y_gamma[i - 1] / trended(lvls[i - 1], dampen(b[i - 1], phi)) + \
                    (gammac * s[i - 1])
            slope = b[1:nobs + 1].copy()
            season = s[m:nobs + m].copy()
            # Out of sample the level stays at its last in-sample value.
            lvls[nobs:] = lvls[nobs]
            if trending:
                b[:nobs] = dampen(b[:nobs], phi)
                b[nobs:] = dampen(b[nobs], phi_h)
            trend = trended(lvls, b)
            s[nobs + m - 1:] = [s[(nobs - 1) + j % m] for j in range(h + 1 + 1)]
            fitted = trend * s[:-m]
        elif seasonal == 'add':
            for i in range(1, nobs + 1):
                lvls[i] = y_alpha[i - 1] - (alpha * s[i - 1]) + \
                       (alphac * trended(lvls[i - 1], dampen(b[i - 1], phi)))
                if trending:
                    b[i] = (beta * detrend(lvls[i], lvls[i - 1])) + \
                           (betac * dampen(b[i - 1], phi))
                s[i + m - 1] = y_gamma[i - 1] - \
                    (gamma * trended(lvls[i - 1], dampen(b[i - 1], phi))) + \
                    (gammac * s[i - 1])
            slope = b[1:nobs + 1].copy()
            season = s[m:nobs + m].copy()
            lvls[nobs:] = lvls[nobs]
            if trending:
                b[:nobs] = dampen(b[:nobs], phi)
                b[nobs:] = dampen(b[nobs], phi_h)
            trend = trended(lvls, b)
            s[nobs + m - 1:] = [s[(nobs - 1) + j % m] for j in range(h + 1 + 1)]
            fitted = trend + s[:-m]
        else:
            for i in range(1, nobs + 1):
                lvls[i] = y_alpha[i - 1] + \
                       (alphac * trended(lvls[i - 1], dampen(b[i - 1], phi)))
                if trending:
                    b[i] = (beta * detrend(lvls[i], lvls[i - 1])) + \
                           (betac * dampen(b[i - 1], phi))
            slope = b[1:nobs + 1].copy()
            season = s[m:nobs + m].copy()
            lvls[nobs:] = lvls[nobs]
            if trending:
                b[:nobs] = dampen(b[:nobs], phi)
                b[nobs:] = dampen(b[nobs], phi_h)
            trend = trended(lvls, b)
            fitted = trend
        level = lvls[1:nobs + 1].copy()
        # The isinstance check catches use_boxcox == 0.0, which is falsy.
        if use_boxcox or use_boxcox == 'log' or isinstance(use_boxcox, float):
            # Map the components back to the scale of the original data.
            fitted = inv_boxcox(fitted, lamda)
            level = inv_boxcox(level, lamda)
            slope = detrend(trend[:nobs], level)
            if seasonal == 'add':
                season = (fitted - inv_boxcox(trend, lamda))[:nobs]
            else:  # seasonal == 'mul':
                season = (fitted / inv_boxcox(trend, lamda))[:nobs]
        # fitted[:-h - 1] is the in-sample part, fitted[-h - 1:] the forecast.
        sse = sqeuclidean(fitted[:-h - 1], data)
        # (s0 + gamma) + (b0 + beta) + (l0 + alpha) + phi
        k = m * seasoning + 2 * trending + 2 + 1 * damped
        aic = self.nobs * np.log(sse / self.nobs) + k * 2
        if self.nobs - k - 3 > 0:
            aicc_penalty = (2 * (k + 2) * (k + 3)) / (self.nobs - k - 3)
        else:
            aicc_penalty = np.inf
        aicc = aic + aicc_penalty
        bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
        resid = data - fitted[:-h - 1]
        if remove_bias:
            fitted += resid.mean()
        self.params = {'smoothing_level': alpha,
                       'smoothing_slope': beta,
                       'smoothing_seasonal': gamma,
                       'damping_slope': phi if damped else np.nan,
                       'initial_level': lvls[0],
                       'initial_slope': b[0] / phi,
                       'initial_seasons': s[:m],
                       'use_boxcox': use_boxcox,
                       'lamda': lamda,
                       'remove_bias': remove_bias}

        # Format parameters into a DataFrame
        codes = ['alpha', 'beta', 'gamma', 'l.0', 'b.0', 'phi']
        codes += ['s.{0}'.format(i) for i in range(m)]
        idx = ['smoothing_level', 'smoothing_slope', 'smoothing_seasonal',
               'initial_level', 'initial_slope', 'damping_slope']
        idx += ['initial_seasons.{0}'.format(i) for i in range(m)]

        formatted = [alpha, beta, gamma, lvls[0], b[0], phi]
        formatted += s[:m].tolist()
        formatted = list(map(lambda v: np.nan if v is None else v, formatted))
        formatted = np.array(formatted)
        # The np.bool alias was removed in NumPy 1.24; the builtin bool is the
        # documented replacement and yields the same dtype.
        if is_optimized is None:
            optimized = np.zeros(len(codes), dtype=bool)
        else:
            optimized = is_optimized.astype(bool)
        # Only report the parameters that belong to the configured model.
        included = [True, trending, seasoning, True, trending, damped]
        included += [True] * m
        formatted = pd.DataFrame([[c, f, o] for c, f, o in zip(codes, formatted, optimized)],
                                 columns=['name', 'param', 'optimized'],
                                 index=idx)
        formatted = formatted.loc[included]

        hwfit = HoltWintersResults(self, self.params, fittedfcast=fitted,
                                   fittedvalues=fitted[:-h - 1], fcastvalues=fitted[-h - 1:],
                                   sse=sse, level=level, slope=slope, season=season, aic=aic,
                                   bic=bic, aicc=aicc, resid=resid, k=k,
                                   params_formatted=formatted, optimized=optimized)
        return HoltWintersResultsWrapper(hwfit)
Exemplo n.º 46
0
 def __box_cox_transform(a):
     """Return the Box-Cox transform of ``a``; scipy picks the lambda."""
     from scipy import stats

     # With lmbda=None and alpha=None scipy returns exactly two values: the
     # transformed data and the lambda it fitted. Only the data is needed.
     transformed, _fitted_lambda = stats.boxcox(a, lmbda=None, alpha=None)
     return transformed
Exemplo n.º 47
0
# Separate the numeric features.
# .copy() gives an independent frame, so the dtype fix below does not write
# into a slice of df (which triggers pandas' SettingWithCopyWarning).
df_numeric = df[[
    'subscriber', 'Trend_day_count', 'Tag_count', 'Trend_tag_count',
    'comment_count', 'likes', 'dislike'
]].copy()
df_numeric["Trend_tag_count"] = df["Trend_tag_count"].astype('int64')
import numpy as np
# log(1 + x) on the independent variables to reduce skewness
df_numeric1 = np.log1p(df_numeric)

# Transform the response variable.
from scipy import stats
from scipy.stats import boxcox
y = df['views']
# Drop missing values first and only then transform: box_y used to be read
# before it was assigned, which raised a NameError.
y = y.dropna()
box_y = y.copy()
# With lmbda=0.0 the Box-Cox transform is the natural logarithm.
box_y = boxcox(box_y, lmbda=0.0)

###### NLP part #####
# Preprocess the text data; the description column is left out because it
# contains unwanted info.
n_data = df[["channel_title", "title", "tags"]]
n_data = n_data.dropna()
# Tags are '|'-separated; turn them into space-separated words.
n_data["tags"] = n_data["tags"].str.split('|').str.join(' ')
# One text field per row: channel title, title and tags joined by spaces.
n_data["np"] = n_data["channel_title"] + ' ' + n_data["title"] + ' ' + n_data[
    "tags"]
nlp_data = n_data[["np"]]

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
Exemplo n.º 48
0
# Load the series with the 'month' column parsed as dates (day first) and
# used as the index; the file is ';'-separated.
df = pd.read_csv(
    '/Users/nikita/PycharmProjects/ML_Tasks/course5/week1/timelines/WAG_C_M.csv',
    index_col=['month'],
    parse_dates=['month'],
    dayfirst=True,
    sep=';')
df.plot()
plt.ylabel('Salary(rubles')
plt.show()
# Element [1] of the adfuller result is printed as the p-value of the test.
print("Критерий Дики-Фуллера Оригинал: p=%f" %
      sm.tsa.stattools.adfuller(df.WAG_C_M)[1])
# The series is non-stationary: upward trend and seasonality with a one-year period
sm.tsa.seasonal_decompose(df.WAG_C_M).plot()
plt.show()
# Apply a Box-Cox transform to stabilise the variance:
df["salary_bxcx"], lmbda = stats.boxcox(df.WAG_C_M)
df["salary_bxcx"].plot()
plt.ylabel(u'Transformed Salaries')
print("Оптимальный параметр преобразования Бокса-Кокса: %f" % lmbda)
print("Критерий Дики-Фуллера После преобразования Бокса-Кокса: p=%f" %
      sm.tsa.stattools.adfuller(df["salary_bxcx"])[1])
plt.show()
# The series is smoother, but still clearly non-stationary
# Try seasonal differencing with a seasonal lag of 12
df['salary_box_diff'] = df.salary_bxcx - df.salary_bxcx.shift(12)
# The first 12 values are NaN after the shift, hence the [12:] slices below.
sm.tsa.seasonal_decompose(df.salary_box_diff[12:]).plot()
print(
    "Критерий Дики-Фуллера преобразования Бокса-Кокса и Сезонного дифференцирования: p=%f"
    % sm.tsa.stattools.adfuller(df.salary_box_diff[12:])[1])
plt.show()
# Dickey-Fuller p-value is < 0.05, but a trend is still visible
Exemplo n.º 49
0
def normalize_data(ingredient_attribute, prediction_attributes,
                   output_path="./data/transformed_data_us.csv"):
    """
    Box-Cox transform the numeric ``*_100g`` columns used for clustering.

    Background (from the original author): the clusters were not separating
    well because the features are not normally distributed, while GMM expects
    roughly Gaussian input. The feature histograms look geometric/log-like, so
    each column is transformed with scipy's built-in Box-Cox.
    Reference: http://www.kmdatascience.com/2017/07/box-cox-transformations-in-python.html

    For every column selected by ``prediction_attributes`` whose name ends in
    ``_100g`` and whose dtype is numeric:

    1. rows with a value below 0 or above 100 are removed from the whole frame
       (Open Food Facts, https://static.openfoodfacts.org/data/data-fields.txt:
       "fields that end with _100g correspond to the amount of a nutriment
       (in g, or kJ for energy) for 100 g or 100 ml of product");
    2. values below 1 are raised to 1, because Box-Cox needs strictly positive
       input;
    3. the column is replaced by its Box-Cox transform with the lambda that
       scipy estimates.

    Parameters
    ----------
    ingredient_attribute : pandas.DataFrame
        Input table. It is not modified; a filtered, transformed copy is
        returned.
    prediction_attributes : list
        Column labels to consider for the transformation.
    output_path : str or None, optional
        Where the result is written as CSV. ``None`` skips writing. The
        default is the path that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The filtered and transformed table.
    """
    # Lambda comparison noted by the original author against
    # https://www.kaggle.com/allunia/hidden-treasures-in-our-groceries
    # (feature | Kaggle author's lambda | lambda obtained here):
    #   1) energy | 0.7   | 0.617
    #   2) carbs  | 0.9   | -0.346
    #   3) fat    | 0.5   | -82.03
    #   4) protein| 0.1   | -6.44
    #   5) sugar  | 0.03  | -1.37
    #   6) salt   | 0.005 | -2.44
    # Trying other lambdas did not improve the silhouette score, so scipy's
    # estimate is kept. The author suggests clustering on three variables
    # only; energy, carbs and sugar need the least normalization and have the
    # lowest share of zeros.
    for i in ingredient_attribute[prediction_attributes].columns:
        is_per_100g = i.endswith("_100g") and np.issubdtype(
            ingredient_attribute[i].dtype, np.number)
        if not is_per_100g:
            continue

        # Keep only rows inside the documented 0..100 range. A boolean mask
        # (rather than drop-by-label) removes exactly the offending rows even
        # when index labels repeat; NaN compares False and is therefore kept.
        column = ingredient_attribute[i]
        out_of_range = (column < 0) | (column > 100)
        ingredient_attribute = ingredient_attribute.loc[~out_of_range].copy()

        # Box-Cox requires strictly positive values, so everything below 1 is
        # raised to 1. np.where builds a new array, so nothing is written into
        # the frame's own buffer (which may be read-only).
        raw = np.asarray(ingredient_attribute[i])
        positive = np.where(raw < 1, 1, raw)

        # stats.boxcox returns (transformed data, fitted lambda).
        ingredient_attribute[i] = stats.boxcox(positive)[0]

    if output_path is not None:
        ingredient_attribute.to_csv(output_path)
    return ingredient_attribute
Exemplo n.º 50
0
    def box_cox(self, data_df, group, attribute):
        """Box-Cox transform ``attribute`` separately within each ``group``.

        The column is overwritten in ``data_df`` itself, and that same frame
        is returned.
        """

        def _transform_one_group(values):
            # boxcox returns a sequence whose first item is the transformed
            # data; the rest is not needed here.
            return scipy.boxcox(values)[0]

        per_group = data_df.groupby(group)[attribute]
        data_df[attribute] = per_group.transform(_transform_one_group)
        return data_df
Exemplo n.º 51
0
# Progress message for the Box-Cox feature extraction step.
print("## Extracting Box-Cox features ")
# print("## Plot Transformations for **amount**:")
# figure = plt.figure(figsize=(16, 5))
# figure.add_subplot(131)
# plt.title("Amount Histogram")
# plt.hist(tmpData['amount'] ,facecolor='blue',alpha=0.75)
# plt.xlabel("Transaction amount")
# plt.text(10,100000,"Skewness: {0:.2f}".format(skew(tmpData['amount'])))
# figure.add_subplot(132)
# plt.title("SQRT on amount histogram")
# plt.hist(np.sqrt(tmpData['amount']), facecolor = 'red', alpha=0.5)
# plt.xlabel("Square root of amount")
# plt.text(10, 100000, "Skewness: {0:.2f}".format(skew(np.sqrt(tmpData['amount']))))

# New feature: the amount shifted by +1, passed through boxcox (only the
# first returned item is used) and then through preprocessing.scale.
# NOTE(review): the +1 presumably keeps zero amounts valid for Box-Cox,
# which needs strictly positive input -- confirm that amounts are >= 0.
tmpData['amount_boxcox'] = preprocessing.scale(
    boxcox(tmpData['amount'] + 1)[0])
# figure.add_subplot(133)
# plt.title("Box-cox on amount histogram")
# plt.hist(tmpData['amount_boxcox'], facecolor = 'red', alpha=0.5)
# plt.xlabel("Box cox of amount")
# plt.text(10, 100000, "Skewness: {0:.2f}".format(skew(tmpData['amount_boxcox'])))
# plt.show()
# High skewness on left side but box-cox reveals normal distribution

# print("## Plot Transformations for **oldbalanceOrg**:")
# figure = plt.figure(figsize=(16, 5))
# figure.add_subplot(131)
# plt.title("oldbalanceOrg Histogram")
# plt.hist(tmpData['oldbalanceOrg'] ,facecolor='blue',alpha=0.75)
# plt.xlabel("old balance originated")
# plt.text(2,100000,"Skewness: {0:.2f}".format(skew(tmpData['oldbalanceOrg'])))
Exemplo n.º 52
0
    def redraw(self):
        """Rebuild the plot from the current widget state and display it.

        Reads the selected rows/columns of ``DS.Raw``, validates the X/Y
        combo-box selection, draws the plot type chosen by the radio buttons
        (scatter, ellipse, Box-Cox correlation or histogram) on a new
        ``Figure``, applies the label/grid/tick options and finally swaps the
        figure into the widget with ``rmmpl`` / ``addmpl``.

        Every validation failure shows a critical message box and returns an
        empty tuple; the normal path returns None.
        """
        if self.XcomboBox.currentText() == self.YcomboBox.currentText():
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           'Variables \n must be different !',
                                           QtWidgets.QMessageBox.Ok)
            return ()
        if (self.XcomboBox.currentText() == 'Auto') and (
                self.YcomboBox.currentText()
                == 'All') and not self.scatterradioButton.isChecked():
            QtWidgets.QMessageBox.critical(
                self, 'Error',
                "You have to select two rows \n for this kind of plot!",
                QtWidgets.QMessageBox.Ok)
            return ()
        # Selected data plus the per-row label / colour / group columns, so
        # that they stay aligned with the rows while NaNs are dropped.
        data = DS.Raw.iloc[DS.Ir, DS.Ic]
        data = data.assign(Lr=DS.Lr[DS.Ir])
        data = data.assign(Cr=DS.Cr[DS.Ir])
        data = data.assign(Gr=DS.Gr[DS.Ir])
        if (self.XcomboBox.currentText() !=
                'Auto') and (self.YcomboBox.currentText() != 'All'):
            data = data.loc[[
                self.XcomboBox.currentText(),
                self.YcomboBox.currentText()
            ]]
        elif (self.XcomboBox.currentText() !=
              'Auto') and (self.YcomboBox.currentText() == 'All'):
            QtWidgets.QMessageBox.critical(self, 'Error', "Select two rows!",
                                           QtWidgets.QMessageBox.Ok)
            return ()
        elif (self.XcomboBox.currentText()
              == 'Auto') and (self.YcomboBox.currentText() != 'All'):
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           "Use Univariate plot!",
                                           QtWidgets.QMessageBox.Ok)
            return ()
        # Number of missing cells, shown as an annotation further down. The
        # previous expression applied isnull() twice and was therefore always
        # False, so the annotation could never appear.
        # NOTE(review): a plain count is assumed here -- confirm that a count
        # (and not a fraction) is what the '{:04.2f} NaN' label should show.
        Nnan = int(data.isnull().sum().sum())
        # Drop every column that contains a NaN (dropna on the transpose).
        data = data.T.dropna()
        data = data.T
        Lr = data['Lr'].values
        Cr = data['Cr'].values
        Gr = data['Gr'].values
        data = data.drop('Lr', axis=1)
        data = data.drop('Cr', axis=1)
        data = data.drop('Gr', axis=1)
        # NOTE(review): this condition can never be true (dtypes.all() is a
        # bool, never equal to 'float' or 'int'), so the message is never
        # shown. It is left untouched because the intended dtype check cannot
        # be confirmed from here.
        if data.dtypes.all() == 'float' and data.dtypes.all() == 'int':
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           "Some values are not numbers!",
                                           QtWidgets.QMessageBox.Ok)
            return ()
        if (self.XcomboBox.currentText() !=
                'Auto') and (self.YcomboBox.currentText() != 'All'):
            if data.shape[0] != 2:
                QtWidgets.QMessageBox.critical(self, 'Error',
                                               "Raw labels must be different",
                                               QtWidgets.QMessageBox.Ok)
                return ()
            x = data.loc[self.XcomboBox.currentText()].values
            y = data.loc[self.YcomboBox.currentText()].values
        fig = Figure()
        ax = fig.add_subplot(111)
        color = 'blue'
        if self.scatterradioButton.isChecked():
            if (self.XcomboBox.currentText() !=
                    'Auto') and (self.YcomboBox.currentText() != 'All'):
                # Two rows selected: plot one against the other.
                if self.PcheckBox.isChecked():
                    ax.scatter(x, y, marker='o', color=Cr)
                if self.LcheckBox.isChecked():
                    ax.plot(x, y, color='blue')
                if self.VcheckBox.isChecked():
                    for i, txt in enumerate(Lr):
                        ax.annotate(txt, (x[i], y[i]))
                ax.set_xlabel(self.XcomboBox.currentText())
                ax.set_ylabel(self.YcomboBox.currentText())
            else:
                # 'Auto' / 'All': plot every row against the column position.
                nr, nc = data.shape
                Lc = DS.Lc[DS.Ic]
                x = range(1, nc + 1)
                color = Cr
                # Always bound, because the group filter below reads it even
                # when the group-colour checkbox is off (this used to raise a
                # NameError in that case).
                groups = Gr
                if self.GcheckBox.isChecked():
                    ngr = len(np.unique(groups))
                    color = []
                    for key in groups:
                        color.append(cm.viridis.colors[int(
                            (len(cm.viridis.colors) - 1) / ngr * key)])
                for i in range(nr):
                    y = data.iloc[i, :]
                    col = color[i]
                    if self.GcomboBox.currentText() == 'All':
                        if self.PcheckBox.isChecked():
                            ax.scatter(x, y, marker='o', color=col)
                        if self.LcheckBox.isChecked():
                            ax.plot(x, y, color=col)
                    else:
                        if int(self.GcomboBox.currentText()) == groups[i]:
                            if self.PcheckBox.isChecked():
                                ax.scatter(x, y, marker='o', color=col)
                            if self.LcheckBox.isChecked():
                                ax.plot(x, y, color=col)
                if (nc > 30):
                    # Too many columns to label each one: keep 20 evenly
                    # spaced tick labels.
                    itick = np.linspace(0, nc - 1, 20).astype(int)
                    ltick = Lc[itick]
                else:
                    itick = x
                    ltick = Lc
                ax.set_xlim([0, nc + 2])
                ax.set_xticks(itick)
                ax.set_xticklabels(ltick, rotation='vertical')
        if self.ellipseradioButton.isChecked():

            def plot_ellipse(x, y, nstd=2, ax=None, **kwargs):
                # Draw the covariance ellipse of (x, y) scaled to nstd
                # standard deviations and return the artist.
                def eigsorted(cov):
                    # Eigen-decomposition, largest eigenvalue first.
                    vals, vecs = np.linalg.eigh(cov)
                    order = vals.argsort()[::-1]
                    return vals[order], vecs[:, order]

                pos = (x.mean(), y.mean())
                cov = np.cov(x, y).tolist()
                vals, vecs = eigsorted(cov)
                theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))
                width, height = 2 * nstd * np.sqrt(vals)
                ellip = Ellipse(xy=pos,
                                width=width,
                                height=height,
                                angle=theta,
                                fill=False,
                                **kwargs)
                ax.add_artist(ellip)
                return ellip

            for j in range(1, 4):
                plot_ellipse(x, y, j, ax)
            ax.scatter(x, y)
            ax.set_xlabel(self.XcomboBox.currentText())
            ax.set_ylabel(self.YcomboBox.currentText())
            ax.set_title('Ellipse for 1,2,3 times the Standard Deviation')
        if self.boxcoxradioButton.isChecked():
            # Only x goes through the Box-Cox transform, so x must be strictly
            # positive. The previous test joined the x and y checks with
            # `and`, which let a non-positive x through whenever y happened
            # to be positive.
            if not (x > 0).all():
                QtWidgets.QMessageBox.critical(
                    self, 'Error', "Values must be strictly positive",
                    QtWidgets.QMessageBox.Ok)
                return ()
            # Correlation between Box-Cox(x, lambda) and y on a lambda grid.
            CBC = np.zeros(50)
            vlambda = np.linspace(-2, 2, 50)
            for i, lmbda_i in enumerate(vlambda):
                trans_x = stats.boxcox(x, lmbda_i)
                CBC[i] = np.corrcoef(trans_x, y)[0, 1]
            if self.PcheckBox.isChecked():
                ax.scatter(vlambda, CBC, marker='o', color=color)
            if self.LcheckBox.isChecked():
                ax.plot(vlambda, CBC, color=color)
            ax.set_xlabel('Lambda')
            ax.set_ylabel('Correlation Coefficient')
        if self.histogramradioButton.isChecked():
            cx = 'blue'
            cy = 'red'
            xm = x.mean()
            ym = y.mean()
            xstd = x.std()
            ystd = y.std()
            # If the mean +/- 3 std ranges of x and y do not overlap, both are
            # L2-normalized so that they can share one axis.
            dy = (ym - 3 * ystd) - (xm + 3 * xstd)
            dx = (xm - 3 * xstd) - (ym + 3 * ystd)
            if (dy > 0) | (dx > 0):
                x = sk.preprocessing.normalize(x.reshape(1, -1),
                                               norm='l2',
                                               axis=1,
                                               copy=True,
                                               return_norm=False)
                y = sk.preprocessing.normalize(y.reshape(1, -1),
                                               norm='l2',
                                               axis=1,
                                               copy=True,
                                               return_norm=False)
                x = x.ravel()
                y = y.ravel()
                ax.set_xlabel('Normalized Quantities')
            # Suggested bin count from the interquartile range of x and the
            # joint data range; the spin box can only raise it.
            iqr = np.percentile(x, [75, 25])
            iqr = iqr[0] - iqr[1]
            n = x.size
            dx = abs(max((x.max(), y.max())) - min((x.min(), y.min())))
            nbins = int(np.floor(dx / (2 * iqr) * n**(1 / 3))) + 1
            if nbins > self.spinBox.value():
                self.spinBox.setValue(nbins)
            else:
                nbins = self.spinBox.value()
            bins = np.linspace(min((x.min(), y.min())), max(
                (x.max(), y.max())), nbins)
            ax.hist(x,
                    bins=bins,
                    histtype='bar',
                    color=cx,
                    alpha=0.5,
                    orientation='vertical',
                    label=str(self.XcomboBox.currentText()))
            ax.hist(y,
                    bins=bins,
                    histtype='bar',
                    color=cy,
                    alpha=0.5,
                    orientation='vertical',
                    label=str(self.YcomboBox.currentText()))
            # Shrink the axes to 80% width to make room for the legend.
            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
            ax.legend(bbox_to_anchor=(1, 1),
                      loc='upper left',
                      borderaxespad=0.2)
        if Nnan:
            ax.annotate('{:04.2f} NaN'.format(Nnan),
                        xy=(0.80, 0.95),
                        xycoords='figure fraction')
        # User overrides for labels, grid, title and tick marks.
        if self.XcheckBox.isChecked():
            if self.XlineEdit.text():
                ax.set_xlabel(self.XlineEdit.text())
        else:
            ax.set_xlabel('')
        if self.YcheckBox.isChecked():
            if self.YlineEdit.text():
                ax.set_ylabel(self.YlineEdit.text())
        else:
            ax.set_ylabel('')
        if self.XGcheckBox.isChecked():
            ax.xaxis.grid(True)
        if self.YGcheckBox.isChecked():
            ax.yaxis.grid(True)
        if self.TlineEdit.text():
            ax.set_title(self.TlineEdit.text())
        # tick_params expects booleans; the string 'off' is truthy and does
        # not hide anything on current Matplotlib versions.
        if not self.XMcheckBox.isChecked():
            ax.tick_params(axis='x',
                           which='both',
                           bottom=False,
                           top=False,
                           labelbottom=False)
        if not self.YMcheckBox.isChecked():
            ax.tick_params(axis='y',
                           which='both',
                           left=False,
                           right=False,
                           labelleft=False)
        self.rmmpl()
        self.addmpl(fig)
Exemplo n.º 53
0
    def development_score_by_draft_count(self, players_df, coaches_df):
        """Score each coach by how often his players get drafted.

        For every star rating the overall fraction of drafted players is
        computed and written to ``drafted_fraction_by_star.csv`` in
        ``self.output_dir``. For each coach the same fraction is computed over
        the players of his team that overlap his tenure; the median percentage
        difference to the overall fraction (over star ratings with at least 5
        players) becomes his ``development_ability``. The scores are then
        shifted to be strictly positive, Box-Cox transformed and standardized.

        Parameters
        ----------
        players_df : pandas.DataFrame
            Reads the columns ``team``, ``stars``, ``drafted``, ``enrolled``
            and ``ncaaf_years``.
        coaches_df : pandas.DataFrame
            Reads the columns ``team``, ``first_year`` and ``last_year``. A
            ``development_ability`` column is written into this frame while
            the coaches are processed.

        Returns
        -------
        pandas.DataFrame
            The coaches that received a score, re-indexed from 0, with the
            standardized ``development_ability``.
        """

        def drafted_fraction_by_stars(frame):
            # One row per star rating: drafted count, player count and ratio.
            drafted_counts = frame[frame['drafted'].notnull()].groupby(
                ['stars']).size()
            total_counts = frame.groupby(['stars']).size()
            table = pd.concat([drafted_counts, total_counts], axis=1)
            table.columns = ['drafted', 'total']
            table['stars'] = table.index
            table = table.reset_index(drop=True)
            table['fraction'] = table['drafted'] / table['total']
            return table

        drafted_fraction = drafted_fraction_by_stars(players_df)
        drafted_fraction.to_csv(
            os.path.join(self.output_dir, 'drafted_fraction_by_star.csv'))

        for index, coach in coaches_df.iterrows():
            players = players_df[players_df['team'] == coach['team']]
            # A player counts for this coach if any of the three holds:
            # drafted during the tenure (up to one year after it ended), ...
            drafted_in_tenure = (players['drafted'] > coach['first_year']) & (
                players['drafted'] <= coach['last_year'] + 1)
            # ... on the team during the tenure given his recorded years, ...
            played_in_tenure = (
                (players['enrolled'] + players['ncaaf_years']) >
                coach['first_year']) & (players['enrolled'] <=
                                        coach['last_year'])
            # ... or within 4 years of enrolling (covers a missing
            # ncaaf_years, which compares False above).
            enrolled_in_tenure = ((players['enrolled'] + 4) >
                                  coach['first_year']) & (
                                      players['enrolled'] <=
                                      coach['last_year'])
            players = players[drafted_in_tenure | played_in_tenure |
                              enrolled_in_tenure]

            coach_drafted_fraction = drafted_fraction_by_stars(players)[[
                'stars', 'drafted', 'total', 'fraction'
            ]]

            # Coach columns get the suffix _x, the overall ones _y.
            merged = pd.merge(left=coach_drafted_fraction,
                              right=drafted_fraction,
                              how='left',
                              left_on='stars',
                              right_on='stars')

            # Ignore star ratings with fewer than 5 players under this coach.
            merged = merged.dropna(subset=['total_x'])
            merged = merged[merged['total_x'] >= 5]
            # No drafted player at a rating shows up as NaN: treat it as 0.
            merged['fraction_x'] = merged['fraction_x'].fillna(0)

            perc_diff = (merged['fraction_x'] -
                         merged['fraction_y']) / (merged['fraction_y']) * 100

            coaches_df.loc[index, 'development_ability'] = perc_diff.median()

        coaches_df = coaches_df.dropna(subset=['development_ability'])
        coaches_df = coaches_df.reset_index(drop=True)
        if coaches_df.empty:
            # Nothing to transform; Box-Cox cannot handle an empty sample.
            return coaches_df

        # Shift so the smallest score becomes 0.01: Box-Cox needs strictly
        # positive input.
        scores = coaches_df['development_ability']
        shifted = scores - scores.min() + .01

        # Call boxcox on the values directly. Series.transform only accepts
        # functions that return a Series with the same index, so passing a
        # function that returns a bare array is rejected by newer pandas.
        coaches_df['development_ability'] = scipy.boxcox(shifted.values)[0]

        # Standardize to zero mean and unit (sample) standard deviation.
        mean_percent_drafted = coaches_df['development_ability'].mean()
        std_percent_drafted = coaches_df['development_ability'].std()
        coaches_df['development_ability'] = (
            coaches_df['development_ability'] -
            mean_percent_drafted) / std_percent_drafted
        coaches_df = coaches_df.reset_index(drop=True)

        return coaches_df
# Plot the two monthly series on the same axes.
plt.plot(df_month.y, '-', label='true-values_By Months')
plt.plot(df_month.original, '-', label='rew-data_By Months')
plt.legend()

# plt.tight_layout()
plt.show()

# Inspect the trend
plt.figure(figsize=[15, 7])
sm.tsa.seasonal_decompose(df_month.y).plot()
# Element [1] of the adfuller result is printed as the p-value.
print("work3 test: p={}".format(adfuller(df_month.y)[1]))
# air_passengers test: p=0.996129346920727

# Box-Cox transformation of the time series
df_month['y_box'], lmbda = stats.boxcox(df_month.y)
print("work3 test: p={}".format(adfuller(df_month.y_box)[1]))
# air_passengers test: p=0.7011194980409873

# Seasonal differentiation
# Seasonal differencing with lag 12, used to determine the m parameter of
# "sax" (presumably the seasonal period of a SARIMAX model -- confirm).
# This statement used to be pasted three times in a row; one assignment
# produces the same column.
df_month['y_box_diff'] = df_month['y_box'] - df_month['y_box'].shift(12)
def price_prediction_data():
    """Return closing prices and SARIMAX forecasts for one coin as JSON.

    The coin is taken from the ``CoinName`` query parameter of the current
    request.  Its price history is loaded from the database, the closing
    prices are Box-Cox transformed, a seasonal ARIMA model (period 12) is
    fitted, and both the in-sample predictions and a nine-step forecast are
    transformed back to the price scale.

    Returns
    -------
    response
        The object built by ``make_response`` with a JSON content type.
        On success the body is a list of records with the keys ``Coin``,
        ``Date``, ``Close`` and ``Forecast``.  On failure the body is
        ``{"error": <message>}`` with status 400 (missing ``CoinName``),
        404 (coin not present in ``Coins``) or 422 (the data could not be
        transformed or the model could not be fitted).
    """
    # Local import: only the error path needs it and the rest of the file is
    # not visible from here.
    import json

    def error_response(message, status):
        # Small JSON error payload so the caller gets a real answer instead
        # of an unhandled exception.
        failure = make_response(json.dumps({'error': message}), status)
        failure.headers['content-type'] = 'application/json'
        return failure

    coin_of_interest = request.args.get('CoinName')
    if not coin_of_interest:
        # Previously this crashed on `'test' + None`.
        return error_response('Missing required query parameter: CoinName',
                              400)
    print('test' + coin_of_interest)

    # Grab coin prices
    price_query = '''
    SELECT 
        RecordDate,
        OpenPrice,
        High,
        Low,
        ClosingPrice,
        AdjClose,
        Volume,
        c.CoinName,
        cph.TokenName
    FROM
        CoinPriceHistory cph
    INNER JOIN Coins c
        ON cph.CoinID = c.CoinID
    ORDER BY
        RecordDate
    '''
    # Grab coin names
    names_query = '''
    SELECT CoinName FROM Coins
    '''

    # Connect to the database and always release the connection, even when
    # one of the queries fails.
    conn = engine.connect()
    try:
        coin_raw = pd.read_sql(price_query, conn)
        coin_names = pd.read_sql(names_query, conn)
    finally:
        conn.close()

    # NEED TO IMPLEMENT: LINK TO PRICE-PREDICT.HTML
    # Get coin-of-interest from user
    # coin_of_interest = request.form['coi']

    # Determine if the coin is in the db.  Without any rows there is nothing
    # to model, so answer right away instead of failing inside boxcox.
    if (coin_names['CoinName'] == coin_of_interest).any():
        print("We got your coin!")
    else:
        print('We NO got your coin')
        return error_response('Unknown coin: ' + coin_of_interest, 404)

    # Clean data.  Work on explicit copies so the column assignments below
    # never write into a view of `coin_raw`.
    coin_history = coin_raw[coin_raw['CoinName'] == coin_of_interest].copy()
    coin_history['RecordDate'] = pd.to_datetime(coin_history['RecordDate'],
                                                errors='coerce')
    coin_history = coin_history.rename(columns={
        'RecordDate': 'Timestamp',
        'OpenPrice': 'Open',
        'ClosingPrice': 'Close'
    })
    coin_history = coin_history.set_index('Timestamp')
    coin_history = coin_history.drop(
        ['AdjClose', 'Volume', 'CoinName', 'TokenName'], axis=1)
    coin_history = coin_history[coin_history['Close'] != 0].copy()

    # Candidate SARIMA orders.  Only the first combination is fitted, exactly
    # as before (the grid-search loop was already disabled).
    Qs = range(0, 2)
    qs = range(0, 3)
    Ps = range(0, 3)
    ps = range(0, 3)
    D = 1
    d = 1
    param = list(product(ps, qs, Ps, Qs))[0]

    # Generate Price Prediction Data
    def invboxcox(y, lmbda):
        # Inverse of the Box-Cox transform for a given lambda.
        if lmbda == 0:
            return (np.exp(y))
        else:
            return (np.exp(np.log(lmbda * y + 1) / lmbda))

    # Monthly-style end-of-period dates for the forecast horizon (hard-coded).
    prediction_dates = [
        datetime(2021, 4, 30),
        datetime(2021, 5, 31),
        datetime(2021, 6, 30),
        datetime(2021, 7, 31),
        datetime(2021, 8, 31),
        datetime(2021, 9, 30),
        datetime(2021, 10, 31),
        datetime(2021, 11, 30),
        datetime(2021, 12, 31)
    ]

    # Silence warnings only while modelling; the previous global
    # `filterwarnings('ignore')` muted every warning for the whole process.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            # Transform data for ARIMA.  The differenced columns are kept for
            # parity with the earlier output frame; the model below does its
            # own differencing through `d` and `D`.
            coin_history['Box'], lmbda = stats.boxcox(coin_history.Close)
            coin_history['BoxDiff'] = (coin_history.Box -
                                       coin_history.Box.shift(12))
            coin_history['BoxDiff2'] = (coin_history.BoxDiff -
                                        coin_history.BoxDiff.shift(1))
            best_model = sm.tsa.statespace.SARIMAX(
                coin_history.Box,
                order=(param[0], d, param[1]),
                seasonal_order=(param[2], D, param[3], 12)).fit(disp=-1)
        except Exception:
            # Boundary of the request handler: report the failure to the
            # client.  The old bare `except:` printed this message and then
            # crashed on the undefined `model`.
            message = 'Data cannot be conditioned for ARIMA model.  Sorry!'
            print(message)
            return error_response(message, 422)

        in_sample = invboxcox(
            best_model.predict(start=0, end=(len(coin_history) - 1)), lmbda)
        out_of_sample = invboxcox(
            best_model.forecast(steps=len(prediction_dates)), lmbda).tolist()

    coin_history_with_predictions = coin_history[['Close']].copy()
    coin_history_with_predictions['Forecast'] = in_sample
    future = pd.DataFrame(index=prediction_dates, columns=coin_history.columns)
    future['Forecast'] = out_of_sample
    coin_history_with_predictions = pd.concat(
        [coin_history_with_predictions, future])
    coin_history_with_predictions['Coin'] = coin_of_interest
    graph = coin_history_with_predictions.reset_index().rename(
        columns={'index': 'Date'})
    graph2 = graph[['Coin', 'Date', 'Close', 'Forecast']]

    # Return Price Prediction Data to Plotly
    _json = graph2.to_json(orient='records')
    resp = make_response(_json)
    resp.headers['content-type'] = 'application/json'

    return resp
Exemplo n.º 56
0
# Seasonal decomposition and Dickey-Fuller test on the raw monthly prices.
plt.figure(figsize=[15,7])
sm.tsa.seasonal_decompose(df_month.Weighted_Price).plot()
print("Dickey–Fuller test: p=%f" % sm.tsa.stattools.adfuller(df_month.Weighted_Price)[1])
plt.show()


# The series are not stationary.

# ## Box-Cox Transformations

# In[ ]:


# Box-Cox Transformations
df_month['Weighted_Price_box'], lmbda = stats.boxcox(df_month.Weighted_Price)
# Test the *transformed* series; the previous version re-tested the raw
# Weighted_Price column, so the printed p-value never reflected the transform.
print("Dickey–Fuller test: p=%f" % sm.tsa.stattools.adfuller(df_month.Weighted_Price_box)[1])


# The series are not stationary.

# ## Seasonal differentiation

# In[ ]:


# Seasonal differentiation (lag 12); the first 12 values are NaN after the
# shift, so they are skipped in the stationarity test.
df_month['prices_box_diff'] = df_month.Weighted_Price_box - df_month.Weighted_Price_box.shift(12)
print("Dickey–Fuller test: p=%f" % sm.tsa.stattools.adfuller(df_month.prices_box_diff[12:])[1])

Exemplo n.º 57
0
    def fit(self):
        """
        Estimate a trend component, multiple seasonal components, and a
        residual component.

        Each seasonal component is extracted with an STL fit on the series
        from which the other seasonal components have been removed; with
        more than one period the extraction is repeated ``self.iterate``
        times.

        Returns
        -------
        DecomposeResult
            Estimation results.
        """
        num_seasons = len(self.periods)
        # A single seasonal component gains nothing from refinement passes.
        iterate = 1 if num_seasons == 1 else self.iterate

        # Box Cox
        # NOTE(review): `elif self.lmbda:` treats lmbda=0 (which would be a
        # log transform) the same as "no transform" -- confirm this is
        # intended before changing it; the constructor is not visible here.
        if self.lmbda == "auto":
            y, lmbda = boxcox(self._y, lmbda=None)
            self.est_lmbda = lmbda
        elif self.lmbda:
            y = boxcox(self._y, lmbda=self.lmbda)
        else:
            y = self._y

        # Get STL fit params.  Work on a copy: popping from
        # ``self._stl_kwargs`` directly removed these keys from the instance,
        # so a second call to ``fit`` silently lost them.
        stl_kwargs = dict(self._stl_kwargs)
        stl_inner_iter = stl_kwargs.pop("inner_iter", None)
        stl_outer_iter = stl_kwargs.pop("outer_iter", None)

        # Iterate over each seasonal component to extract seasonalities
        seasonal = np.zeros(shape=(num_seasons, self.nobs))
        deseas = y
        for _ in range(iterate):
            for i in range(num_seasons):
                # Add the current estimate back before re-estimating it.
                deseas = deseas + seasonal[i]
                res = STL(
                    endog=deseas,
                    period=self.periods[i],
                    seasonal=self.windows[i],
                    **stl_kwargs,
                ).fit(inner_iter=stl_inner_iter, outer_iter=stl_outer_iter)
                seasonal[i] = res.seasonal
                deseas = deseas - seasonal[i]

        seasonal = np.squeeze(seasonal.T)
        # Trend and robustness weights come from the last STL fit.
        trend = res.trend
        rw = res.weights
        resid = deseas - trend

        # Return pandas if endog is pandas
        if isinstance(self.endog, (pd.Series, pd.DataFrame)):
            index = self.endog.index
            y = pd.Series(y, index=index, name="observed")
            trend = pd.Series(trend, index=index, name="trend")
            resid = pd.Series(resid, index=index, name="resid")
            rw = pd.Series(rw, index=index, name="robust_weight")
            cols = [f"seasonal_{period}" for period in self.periods]
            if seasonal.ndim == 1:
                seasonal = pd.Series(seasonal, index=index, name="seasonal")
            else:
                seasonal = pd.DataFrame(seasonal, index=index, columns=cols)

        # Avoid circular imports
        from statsmodels.tsa.seasonal import DecomposeResult

        return DecomposeResult(y, seasonal, trend, resid, rw)
Exemplo n.º 58
0
def lastF(y, m = 12, h = 12*2, comb = "OLS", aggList = None, include_history = True, cap = None, capF = None, \
        changepoints = None, n_changepoints = 25, yearly_seasonality = True, weekly_seasonality = 'auto', daily_seasonality='auto', holidays = None, seasonality_prior_scale = 10.0, \
        holidays_prior_scale = 10.0, changepoint_prior_scale = 0.05, mcmc_samples = 0, interval_width = 0.80, uncertainty_samples = 0, transform = None):
    """
    Forecast a time series at several temporal aggregation levels and
    reconcile the forecasts.

    Parameters
    ----------------

    y - dataframe of time-series data

        Layout:
            1st Col - Time instances
            2nd Col - Total of TS

    m - (int) frequency of time series eg. weekly is 52 (len(y) > 2*m)

    h - (int) the forecast horizon for the time series

    comb - (String) the type of hierarchical forecasting method that the user wants to use.

        Options:
            "OLS" - optimal combination by ordinary least squares (Default),
            "WLSS" - optimal combination by structurally weighted least squares,
            "WLSV" - optimal combination by variance weighted least squares
            "BU" - bottom up combination

    aggList - (list) The factors that the user would like to consider for ex. m = 52, aggList = [1, 52]

    include_history - (Boolean) input for the forecasting function of Prophet

    cap - (Dataframe or Constant) carrying capacity of the input time series.  If it is a dataframe, then
     the number of columns must equal len(y.columns) - 1

    capF - (Dataframe or Constant) carrying capacity of the future time series.  If it is a dataframe, then
     the number of columns must equal len(y.columns) - 1

    changepoints - (DataFrame or List) changepoints for the model to consider fitting. If it is a dataframe, then
     the number of columns must equal len(y.columns) - 1

    n_changepoints - (constant or list) changepoints for the model to consider fitting. If it is a list, then
     the number of items must equal len(y.columns) - 1

    transform - (None or "BoxCox") Do you want to transform your data before fitting the prophet function? If yes, type "BoxCox".
     Any other value prints a notice and leaves the data untransformed.

    All other inputs - see Prophet

    Returns
    -----------------

    newDict - a dictionary of DataFrames with predictions, seasonalities and trends that can all be plotted

    Raises
    -----------------

    SystemExit - when m <= 1, when y holds fewer than 2*m rows, or when aggList does not contain both 1 and m

    """
    ##
    # Error Catching
    ##
    # The first column is a Series, so the former isinstance(..., DatetimeIndex)
    # test could never succeed; check the dtype instead.  Note that the
    # conversion writes into the caller's dataframe.
    if not pd.api.types.is_datetime64_any_dtype(y.iloc[:,0]):
        y.iloc[:,0] = pd.DatetimeIndex(y.iloc[:,0])
    if m <= 1:
        sys.exit("Seasonal period (m) must be greater than 1")
    if len(y) < 2*m:
        sys.exit("Need at least 2 periods of data")
    if aggList is not None:
        if 1 not in aggList or m not in aggList:
            sys.exit("1 and the seasonal period must be included in the aggList input")
    ##
    # Compute Aggregate Time Series and return a dictionary of dataframes
    ##
    aggs = aggHier(y, m, aggList)
    ##
    # Transform Variables
    ##
    # Default for "no transform"; previously an unrecognised `transform`
    # value left this name unbound and the reconcile call raised NameError.
    boxcoxT = None
    if transform is not None:
        if transform == 'BoxCox':
            import warnings
            levels = sorted(aggs.keys())
            boxcoxT = [None]*len(levels)
            # Escalate RuntimeWarning to an exception only while transforming;
            # the filter used to be installed globally and never removed.
            with warnings.catch_warnings():
                warnings.simplefilter("error", RuntimeWarning)
                try:
                    # Transform copies first so that `aggs` is only replaced
                    # once every level has been transformed successfully.
                    transformed = []
                    for i, key in enumerate(levels):
                        frame = aggs[key].copy()
                        frame.iloc[:, 1], boxcoxT[i] = boxcox(frame.iloc[:, 1])
                        transformed.append(frame)
                    for key, frame in zip(levels, transformed):
                        aggs[key] = frame
                ##
                # Does a Natural Log Transform if scipy's boxcox cant deal
                ##
                except RuntimeWarning:
                    print("It looks like scipy's boxcox function couldn't deal with your data. Proceeding with Natural Log Transform")
                    for i, key in enumerate(levels):
                        aggs[key].iloc[:, 1] = boxcox(aggs[key].iloc[:, 1], lmbda = 0)
                        boxcoxT[i] = 0
        else:
            print("Nothing will be transformed because the input was not = to 'BoxCox'")
    ##
    # Forecast and Reconcile
    ##
    # Discard everything printed while fitting, and close the null device
    # afterwards (the handle used to be left open).
    with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
        forecastsDict, mse, resids = fitProphet(aggs, h, include_history, cap, capF, changepoints, n_changepoints, \
                                                 yearly_seasonality, weekly_seasonality, daily_seasonality, holidays, seasonality_prior_scale, \
                                                 holidays_prior_scale, changepoint_prior_scale, mcmc_samples, interval_width, uncertainty_samples)
    newDict = reconcile(forecastsDict, h, mse, resids, comb, boxcoxT)

    return newDict
Exemplo n.º 59
0
# Plot the distribution of 'Fare' over the training rows, then score a
# logistic regression on the scaled features with 5-fold cross-validation.
sns.distplot(df_fixed['Fare'][:train_num])
plt.show()

df_fixed = MMEncoder.fit_transform(df_fixed)
train_X = df_fixed[:train_num]
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

# The string below is a notebook markdown cell.  In English:
# "Homework 2: running the final boxcox block as-is raises an error because
# the input contains negative values. How can it be fixed so that boxcox can
# be used? (Hint: try fixing the data.)"
"""# 作業2
* 最後的 boxcox 區塊直接執行會造成錯誤, 起因為輸入值有負值, 請問如何修正後可以使用 boxcox? (Hint : 試圖修正資料)
"""

# Apply boxcox to Fare, look at the distribution plot and compute the score
# (running it raises an error; try to fix it).
# NOTE: the active code below transforms 'Pclass', not 'Fare'.
from scipy import stats
df_fixed = copy.deepcopy(df)

# Disabled snippet kept as a string literal (it references 'LotArea').
"""
df_fixed['LotArea'] = stats.boxcox(df_fixed['LotArea'], lmbda=0.15)
"""

# With an explicit lmbda, stats.boxcox returns only the transformed values.
df_fixed['Pclass'] = stats.boxcox(df_fixed['Pclass'], lmbda=0.15)

sns.distplot(df_fixed['Pclass'][:train_num])
plt.show()

df_fixed = MMEncoder.fit_transform(df_fixed)
train_X = df_fixed[:train_num]
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

Exemplo n.º 60
0
import numpy as np
from scipy import stats as sts
# Compute skewness
#skewValues = sts.skew(cell_num)
print(sts.skew(cell_num)) # numpy.ndarray
type(sts.skew(cell_num))

# Same statistic computed column by column through pandas.
skewValues = cell_num.apply(sts.skew, axis=0) # pandas.Series
print(skewValues)

### Box-Cox Transformation
# First try the first six values of AreaCh1 (stats.boxcox only accepts a
# one-dimensional array; lambda is estimated automatically).
from scipy import stats
print(cell['AreaCh1'].head(6))
stats.boxcox(cell['AreaCh1'].head(6))

# stats.boxcox() returns two elements as a tuple: the Box-Cox-transformed
# AreaCh1 and the estimated lambda.
type(stats.boxcox(cell['AreaCh1'].head(6))) # tuple

# Extract the transformed AreaCh1 and the estimated lambda separately.
# (Each call re-runs the estimation.)
stats.boxcox(cell_num['AreaCh1'])[0]
stats.boxcox(cell_num['AreaCh1'])[1]
help(stats.boxcox)

# Supplement: another Box-Cox formula (accepts a two-dimensional array, but
# lambda has to be supplied).
from scipy.special import boxcox1p
lam = 0.16
cell_num_bc = boxcox1p(cell_num, lam)
cell_num_bc