Example #1
import numpy as np
import statsmodels.api as sm
from statsmodels.gam.api import GLMGam, BSplines


def spline_poisson(data, column_name, integer=False):
    # degenerate input: nothing to fit, so echo the column unchanged
    if data.shape[0] < 1:
        column_regression = column_name + '_regression'
        data[column_regression] = data[column_name]
        return data
    else:
        index_origin = data.index.name
        data.index.name = 'init_index'
        data = data.reset_index()
        data['index'] = data.index
        x_spline = data[['index']]
        bs = BSplines(x_spline, df=[4], degree=[3])
        gam_bs = GLMGam.from_formula(f'{column_name} ~ index',
                                     data=data[['index', column_name]],
                                     smoother=bs, family=sm.families.Poisson())
        res_bs = gam_bs.fit()
        column_regression = column_name + '_regression'
        if integer:
            # draw Poisson samples at the fitted means to get integer output
            data[column_regression] = np.random.poisson(res_bs.predict())
        else:
            data[column_regression] = res_bs.predict()
        data = data.drop(columns='index')
        data = data.set_index('init_index')
        data.index.name = index_origin
        return data
Example #2
import pandas as pd
from statsmodels.gam.api import GLMGam, BSplines


def pred_monthly_population_gam(df):
    # drop metadata columns on a copy instead of mutating the caller's frame
    df = df.drop(columns=["Sex", "Age_group", "Value"])

    # df.iloc[1] selects one row of monthly counts as a Series (so the
    # transpose was a no-op); reset_index() yields (period, value) pairs
    df = df.iloc[1].reset_index()
    df.columns = ["time", "pop"]

    # smooth over the integer time index with a cubic B-spline basis
    bs = BSplines(df.index.values, df=[12], degree=[3])

    gam = GLMGam.from_formula("pop ~ 1", data=df, smoother=bs)
    res = gam.fit()
    return pd.Series(res.predict(), index=df["time"])
Example #3
 def fit(self, X, y):
     X, y = self._validate_data(X, y, y_numeric=True)

     # one spline per feature, sharing the configured df and degree
     self.spline = BSplines(
         X, df=[self.df] * self.n_features_in_,
         degree=[self.degree] * self.n_features_in_,
         include_intercept=False
     )
     
     # intercept-only parametric part; the splines carry the signal
     gam = GLMGam(
         y, exog=np.ones(X.shape[0]),
         smoother=self.spline, alpha=self.alpha
     )
     self.gam_predictor = gam.fit()
     
     return self
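
 # A matching predict() sketch for the estimator above (its class definition
 # is not shown here). GLMGamResults.predict() takes the parametric exog plus
 # exog_smooth for the spline part; transform=False matches the array inputs.
 def predict(self, X):
     return self.gam_predictor.predict(
         np.ones(X.shape[0]), exog_smooth=X, transform=False)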
Example #4
    def memorize_chunk(self,
                       x,
                       bs,
                       df=4,
                       degree=3,
                       return_penalty=False,
                       knot_kwds=None):
        assert bs in ("bs", "cc"), "Spline basis not defined!"
        if bs == "bs":
            self.s = BSplines(x,
                              df=[df],
                              degree=[degree],
                              include_intercept=True,
                              knot_kwds=knot_kwds)
        elif bs == "cc":
            self.s = CyclicCubicSplines(x, df=[df])

        self.penalty_matrices = self.s.penalty_matrices
Example #5
 def fit(self, X: pd.DataFrame, y: pd.Series):
     if len(self._smooth_names) == 0:
         bs = None
     else:
         X_spline = X[self._smooth_names]
         bs = BSplines(
             X_spline,
             df=self._dfs,
             degree=[self._degree] * len(self._smooth_names),
             knot_kwds=[{
                 "lower_bound": None if self.lower_bound is None else self.lower_bound[i],
                 "upper_bound": None if self.upper_bound is None else self.upper_bound[i],
             } for i in range(len(self._smooth_names))])
     # parametric part: every column not used as a smooth term
     self._gam_bs = GLMGam(y,
                           X.loc[:, ~X.columns.isin(self._smooth_names)],
                           smoother=bs,
                           alpha=self._alphas,
                           family=self._family)
     self._res_bs = self._gam_bs.fit()
     return self
Example #6
from typing import Tuple

import pandas as pd
import statsmodels.api as sm
from statsmodels.gam.api import BSplines
from statsmodels.genmod.generalized_linear_model import GLMResults


def td_prob(rush_att: pd.DataFrame) -> Tuple[GLMResults, BSplines]:
    """
    TD probability as a function of distance to goal
    """
    rush_att["inv_yards"] = 1 / rush_att["yardline_100"]
    train_params = ["inv_yards"]
    # train_params = ["yardline_100"]
    y_data_train = rush_att["rush_touchdown"]
    x_data_train = rush_att[train_params]
    # TODO: These should be checked by X-val; this situation is more complicated than FGs
    degree, df, alpha = 3, 4, 0.0
    bs = BSplines(x_data_train, df=[df], degree=[degree])
    model = sm.GLMGam(
        y_data_train,
        sm.add_constant(x_data_train[[]]),  # intercept-only parametric part
        smoother=bs,
        alpha=alpha,
        family=sm.families.Binomial(),
    )
    fit = model.fit()
    print(fit.summary2())
    return fit, bs
Example #7
 def fit(self, X: Union[pd.DataFrame, np.ndarray],
         y: Union[pd.Series, np.ndarray]):
     assert not (self.splines == "cyclic_cubic") or (self.degree == 3), \
         "cyclic cubic splines are always cubic (degree 3)"
     # broadcast scalar hyperparameters to one entry per feature
     df = self.df if isinstance(self.df, list) else [self.df] * X.shape[1]
     degree = (self.degree if isinstance(self.degree, list)
               else [self.degree] * X.shape[1])
     alpha = (self.alpha if isinstance(self.alpha, list)
              else [self.alpha] * X.shape[1])
     if self.splines == "cyclic_cubic":
         self.splines_ = CyclicCubicSplines(X, df=df)
     elif self.splines == "b":
         self.splines_ = BSplines(X, df=df, degree=degree)
     else:
         raise ValueError(self.splines)
     # record the feature range seen during fit
     self.x_min_ = np.min(X, axis=0)
     self.x_max_ = np.max(X, axis=0)
     self.estimator_ = GLMGam(y,
                              X,
                              smoother=self.splines_,
                              family=self.family,
                              alpha=alpha)
     self.res_ = self.estimator_.fit()
     return self
Example #8
from typing import Tuple

import pandas as pd
import statsmodels.api as sm
from statsmodels.gam.api import BSplines
from statsmodels.genmod.generalized_linear_model import GLMResults


def basic_gam(data: pd.DataFrame,
              save_pred: bool = False) -> Tuple[GLMResults, BSplines]:
    """
    Regression of success probability as a function of kick distance
    """
    # TODO: treat blocked FGs separately
    data["fg_make"] = data["field_goal_result"].map(
        # {"made": True, "missed": False, "blocked": False}
        {
            "made": 1,
            "missed": 0,
            "blocked": 0
        })
    train_params = ["kick_distance"]
    # data_test, data_train = split_test_train(data)
    # x_data_train = data_train[train_params]
    # y_data_train = data_train["fg_make"]
    # x_data_test = data_test[train_params]
    # y_data_test = data_test["fg_make"]
    x_data_train = data[train_params]
    y_data_train = data["fg_make"]

    # These values should be tested by cross-validation
    # degree 3 has slightly better test loss than 2 and isn't noticeably worse than 4
    degree = 3
    # df = 4 has the least test loss but is the minimum required for degree 3
    df = 4
    # alpha > 0 results in increased test loss. select_penweight() can be used to choose
    # in general.
    alpha = 0.0
    bs = BSplines(x_data_train, df=[df], degree=[degree])

    model = sm.GLMGam(
        y_data_train,
        sm.add_constant(x_data_train[[]]),  # intercept-only parametric part
        smoother=bs,
        alpha=alpha,
        family=sm.families.Binomial(),
    )

    fit = model.fit()

    # test_loss = get_loss(
    #     fit,
    #     sm.add_constant(bs.transform(x_data_test.to_numpy())),
    #     y_data_test
    # )
    df_resid = fit.df_resid
    llf = fit.llf / df_resid
    deviance = fit.deviance / df_resid
    chi_sq_df = fit.pearson_chi2 / df_resid
    print(f"ll / ndf = {llf}")
    print(f"deviance = {deviance}")
    print(f"chi sq / ndf = {chi_sq_df}")
    print(f"AIC = {fit.aic}")

    if save_pred:
        data["fg_make_prob"] = fit.predict(
            sm.add_constant(bs.transform(data[train_params].to_numpy())))

    # defaults to 95% confidence interval (0.05 argument is alpha)
    # print(fit.conf_int(0.1))
    # standard error approximation (95% ~ 2*sigma, double-sided)
    # print(0.25 * (fit.conf_int()[1] - fit.conf_int()[0]))
    # print(dir(sm_model_fit))

    print(f"params = {fit.params}")

    return fit, bs
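
# The comments in basic_gam() note that select_penweight() can choose alpha in
# general. A minimal sketch of that tuning step, assuming `data` is the same
# play-by-play DataFrame basic_gam() consumes (tune_fg_alpha is a hypothetical
# helper name, not part of the original module):
def tune_fg_alpha(data: pd.DataFrame):
    data["fg_make"] = data["field_goal_result"].map(
        {"made": 1, "missed": 0, "blocked": 0})
    x = data[["kick_distance"]]
    bs = BSplines(x, df=[4], degree=[3])
    model = sm.GLMGam(
        data["fg_make"],
        sm.add_constant(x[[]]),  # intercept-only parametric part
        smoother=bs,
        family=sm.families.Binomial(),
    )
    # the first element of the returned tuple is the optimal penalization
    # weight (Example #12 below calls select_penweight() the same way)
    return model.select_penweight()[0]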
Example #9
def harmonizationLearn(data,
                       covars,
                       eb=True,
                       smooth_terms=[],
                       smooth_term_bounds=(None, None),
                       return_s_data=False,
                       orig_model=None,
                       seed=None):
    """
    Wrapper for neuroCombat function that returns the harmonization model.
    
    Arguments
    ---------
    data : a numpy array
        data to harmonize with ComBat, dimensions are N_samples x N_features
    
    covars : a pandas DataFrame 
        contains covariates to control for during harmonization
        all covariates must be encoded numerically (no categorical variables)
        must contain a single column "SITE" with site labels for ComBat
        dimensions are N_samples x (N_covariates + 1)
        
    eb : bool, default True
        whether to use empirical Bayes estimates of site effects
        
    smooth_terms (Optional) :  a list, default []
        names of columns in covars to include as smooth, nonlinear terms
        can be any or all columns in covars, except "SITE"
        if empty, ComBat is applied with a linear model of covariates
        if not empty, Generalized Additive Models (GAMs) are used
        will increase computation time due to search for optimal smoothing
        
    smooth_term_bounds (Optional) : tuple of float, default (None, None)
        feature to support custom boundaries of the smoothing terms
        useful when holdout data covers a different range than the training data
        specify the bounds as (minimum, maximum)
        currently not supported for models with multiple smooth terms
        
    return_s_data (Optional) : bool, default False
        whether to return s_data, the standardized data array
        can be useful for diagnostics but will be costly to save/load if large

    seed (Optional) : int, default None
        By default, this function is non-deterministic. Setting the optional
        argument `seed` will make the function deterministic.


    Returns
    -------
    model : a dictionary of estimated model parameters
        design, var_pooled, B_hat, grand_mean,
        gamma_star, delta_star, info_dict (a neuroCombat invention),
        gamma_hat, delta_hat, gamma_bar, t2, a_prior, b_prior, smooth_model
    
    bayes_data : a numpy array
        harmonized data, corrected for effects of SITE
        dimensions are N_samples x N_features

    s_data (Optional) : a numpy array
        standardized residuals after accounting for `covars` other than `SITE`
        set return_s_data=True to output the variable
    
    """
    # set optional random seed (the original branch was inverted and only
    # seeded when seed was None)
    if seed is not None:
        np.random.seed(seed)

    if orig_model is not None:
        model = orig_model.copy()

    # transpose data as per ComBat convention
    data = data.T
    # prep covariate data
    covar_levels = list(covars.columns)
    batch_labels = np.unique(covars.SITE)
    batch_col = covars.columns.get_loc('SITE')

    if orig_model is not None:
        isTrainSite = covars['SITE'].isin(model['SITE_labels'])
        isTrainSiteLabel = set(model['SITE_labels'])
        isTrainSiteColumns = np.where((pd.DataFrame(np.unique(
            covars['SITE'])).isin(model['SITE_labels']).values).flat)
        isTrainSiteColumnsOrig = np.where((pd.DataFrame(
            model['SITE_labels']).isin(np.unique(covars['SITE'])).values).flat)
        isTestSiteColumns = np.where((~pd.DataFrame(np.unique(
            covars['SITE'])).isin(model['SITE_labels']).values).flat)

    cat_cols = []
    num_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c != 'SITE'
    ]
    smooth_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c in smooth_terms
    ]
    # maintain a dictionary of smoothing information
    smooth_model = {
        'perform_smoothing': len(smooth_terms) > 0,
        'smooth_terms': smooth_terms,
        'smooth_cols': smooth_cols,
        'bsplines_constructor': None,
        'formula': None,
        'df_gam': None
    }
    covars = np.array(covars, dtype='object')
    ### additional setup code from neuroCombat implementation:
    # convert batch col to integer
    covars[:, batch_col] = np.unique(covars[:, batch_col],
                                     return_inverse=True)[-1]
    # create dictionary that stores batch info
    (batch_levels, sample_per_batch) = np.unique(covars[:, batch_col],
                                                 return_counts=True)
    info_dict = {
        'batch_levels': batch_levels.astype('int'),
        'n_batch': len(batch_levels),
        'n_sample': int(covars.shape[0]),
        'sample_per_batch': sample_per_batch.astype('int'),
        'batch_info': [
            list(np.where(covars[:, batch_col] == idx)[0])
            for idx in batch_levels
        ]
    }
    ###
    design = make_design_matrix(covars, batch_col, cat_cols, num_cols)

    ### additional setup if smoothing is performed
    if smooth_model['perform_smoothing']:
        # create cubic spline basis for smooth terms
        X_spline = covars[:, smooth_cols].astype(float)
        if orig_model is None:
            bs = BSplines(X_spline,
                          df=[10] * len(smooth_cols),
                          degree=[3] * len(smooth_cols),
                          knot_kwds=[{
                              'lower_bound': smooth_term_bounds[0],
                              'upper_bound': smooth_term_bounds[1]
                          }])
            # construct formula and dataframe required for gam
            formula = 'y ~ '
            df_gam = {}
            for b in batch_levels:
                formula = formula + 'x' + str(b) + ' + '
                df_gam['x' + str(b)] = design[:, b]
            for c in num_cols:
                if c not in smooth_cols:
                    formula = formula + 'c' + str(c) + ' + '
                    df_gam['c' + str(c)] = covars[:, c].astype(float)
            formula = formula[:-2] + '- 1'
            df_gam = pd.DataFrame(df_gam)
            # for matrix operations, a modified design matrix is required
            design = np.concatenate((df_gam, bs.basis), axis=1)
            # store objects in dictionary
            smooth_model['bsplines_constructor'] = bs
            smooth_model['formula'] = formula
            smooth_model['df_gam'] = df_gam
        else:
            bs_basis = model['smooth_model']['bsplines_constructor'].transform(
                X_spline)
            # construct formula and dataframe required for gam
            formula = 'y ~ '
            df_gam = {}
            for b in batch_levels:
                formula = formula + 'x' + str(b) + ' + '
                df_gam['x' + str(b)] = design[:, b]
            for c in num_cols:
                if c not in smooth_cols:
                    formula = formula + 'c' + str(c) + ' + '
                    df_gam['c' + str(c)] = covars[:, c].astype(float)
            formula = formula[:-2] + '- 1'
            df_gam = pd.DataFrame(df_gam)
            # for matrix operations, a modified design matrix is required
            design = np.concatenate((df_gam, bs_basis), axis=1)
    ###
    # run steps to perform ComBat
    if orig_model is None:
        s_data, stand_mean, var_pooled, B_hat, grand_mean = standardizeAcrossFeatures(
            data, design, info_dict, smooth_model)
        LS_dict = fitLSModelAndFindPriors(s_data, design, info_dict, eb=eb)
        # optional: avoid EB estimates
        if eb:
            gamma_star, delta_star = find_parametric_adjustments(
                s_data, LS_dict, info_dict)
        else:
            gamma_star = LS_dict['gamma_hat']
            delta_star = np.array(LS_dict['delta_hat'])
        bayes_data = adjust_data_final(s_data, design, gamma_star, delta_star,
                                       stand_mean, var_pooled, info_dict)
        # save model parameters in single object
        model = {
            'design': design,
            'SITE_labels': batch_labels,
            'var_pooled': var_pooled,
            'B_hat': B_hat,
            'grand_mean': grand_mean,
            'gamma_star': gamma_star,
            'delta_star': delta_star,
            'info_dict': info_dict,
            'gamma_hat': LS_dict['gamma_hat'],
            'delta_hat': np.array(LS_dict['delta_hat']),
            'gamma_bar': LS_dict['gamma_bar'],
            't2': LS_dict['t2'],
            'a_prior': LS_dict['a_prior'],
            'b_prior': LS_dict['b_prior'],
            'smooth_model': smooth_model,
            'eb': eb,
            'SITE_labels_train': batch_labels,
            'Covariates': covar_levels
        }
        # transpose data to return to original shape
        bayes_data = bayes_data.T
    else:
        # Create train data
        (batch_levels, sample_per_batch) = np.unique(covars[isTrainSite,
                                                            batch_col],
                                                     return_counts=True)
        if batch_levels.size == 0:
            bayes_data_train = np.zeros(shape=(0, data.shape[0]))
            s_data_train = np.zeros(shape=(0, data.shape[0])).T
        else:
            info_dict_train = model['info_dict'].copy()
            info_dict_train['sample_per_batch'] = sample_per_batch.astype(
                'int')
            info_dict_train['batch_info'] = [
                list(np.where(covars[isTrainSite, batch_col] == idx)[0])
                for idx in batch_levels
            ]
            tmp = np.concatenate(
                (np.zeros(shape=(info_dict['n_sample'],
                                 len(model['SITE_labels']))),
                 design[:, len(batch_labels):]),
                axis=1)
            s_data_train, stand_mean_train, _ = applyStandardizationAcrossFeatures(
                data[:, isTrainSite], tmp[isTrainSite, :], info_dict_train,
                model)
            design2 = tmp.copy()
            design2[:, isTrainSiteColumnsOrig[0]] = \
                design[:, isTrainSiteColumns[0]]
            bayes_data_train = adjust_data_final(
                s_data_train, design2[isTrainSite, :], model['gamma_star'],
                model['delta_star'], stand_mean_train, model['var_pooled'],
                info_dict_train)
            # transpose data to return to original shape
            bayes_data_train = bayes_data_train.T

        # Create test data (new SITE)
        (batch_levels, sample_per_batch) = np.unique(covars[~isTrainSite,
                                                            batch_col],
                                                     return_counts=True)
        if batch_levels.size == 0:
            bayes_data_test = np.zeros(shape=(0, data.shape[0]))
            s_data_test = np.zeros(shape=(0, data.shape[0])).T
        else:
            info_dict_test = {
                'batch_levels': batch_levels.astype('int'),
                'n_batch': len(batch_levels),
                'n_sample': int(covars[~isTrainSite, :].shape[0]),
                'sample_per_batch': sample_per_batch.astype('int'),
                'batch_info': [
                    list(np.where(covars[~isTrainSite, batch_col] == idx)[0])
                    for idx in batch_levels
                ]
            }
            design_tmp = np.concatenate(
                (design[:, isTestSiteColumns[0]],
                 design[:, len(batch_labels):]),
                axis=1)
            s_data_test, stand_mean_test, _ = applyStandardizationAcrossFeatures(
                data[:, ~isTrainSite], design_tmp[~isTrainSite, :],
                info_dict_test, model)
            LS_dict = fitLSModelAndFindPriors(s_data_test,
                                              design_tmp[~isTrainSite, :],
                                              info_dict_test,
                                              eb=eb)
            if eb:
                gamma_star, delta_star = find_parametric_adjustments(
                    s_data_test, LS_dict, info_dict_test)
            else:
                gamma_star = LS_dict['gamma_hat']
                delta_star = np.array(LS_dict['delta_hat'])
            betas = []
            for i in range(info_dict_test['n_batch']):
                diff_mean = np.mean(
                    data[:, info_dict_test['batch_info'][i]] - np.dot(
                        design[info_dict_test['batch_info'][i],
                               info_dict['n_batch']:],
                        model['B_hat'][len(model['SITE_labels']):, :]).T,
                    axis=1)
                betas.append(diff_mean)
            new_betas = np.array(betas)
            model['B_hat'] = np.concatenate(
                (model['B_hat'][:len(model['SITE_labels']), :], new_betas,
                 model['B_hat'][len(model['SITE_labels']):, :]))
            model['SITE_labels'] = np.append(
                model['SITE_labels'],
                list(set(batch_labels) - isTrainSiteLabel))
            model['gamma_star'] = np.append(model['gamma_star'],
                                            gamma_star,
                                            axis=0)
            model['delta_star'] = np.append(model['delta_star'],
                                            delta_star,
                                            axis=0)
            model['info_dict']['n_batch'] = len(model['SITE_labels'])
            bayes_data_test = adjust_data_final(s_data_test,
                                                design_tmp[~isTrainSite, :],
                                                gamma_star, delta_star,
                                                stand_mean_test,
                                                model['var_pooled'],
                                                info_dict_test)
            # transpose data to return to original shape
            bayes_data_test = bayes_data_test.T
        bayes_data = np.zeros(shape=data.T.shape)
        bayes_data[isTrainSite, :] = bayes_data_train
        bayes_data[~isTrainSite, :] = bayes_data_test
        s_data = np.zeros(shape=data.T.shape)
        s_data[isTrainSite, :] = s_data_train.T
        s_data[~isTrainSite, :] = s_data_test.T

    if return_s_data:
        return model, bayes_data, s_data.T
    else:
        return model, bayes_data
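
# A minimal usage sketch for harmonizationLearn(). The synthetic `features`
# array and the "AGE" covariate are illustrative assumptions; the function
# itself only requires a numeric covariate DataFrame with a "SITE" column.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
features = rng.normal(size=(100, 5))                    # N_samples x N_features
covars = pd.DataFrame({
    'SITE': rng.choice(['siteA', 'siteB'], size=100),   # required site labels
    'AGE': rng.uniform(20, 80, size=100),               # numeric covariate
})
# fit with AGE as a smooth (GAM) term, bounding the spline support
model, harmonized = harmonizationLearn(features, covars,
                                       smooth_terms=['AGE'],
                                       smooth_term_bounds=(18.0, 90.0))
# passing the learned model back in reuses its spline constructor and site
# estimates, extending them for any previously unseen SITE labels
model2, harmonized2 = harmonizationLearn(features, covars,
                                         orig_model=model, seed=0)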
Example #10
def harmonizationLearn(data,
                       covars,
                       eb=True,
                       smooth_terms=[],
                       smooth_term_bounds=(None, None),
                       return_s_data=False):
    """
    Wrapper for neuroCombat function that returns the harmonization model.
    
    Arguments
    ---------
    data : a numpy array
        data to harmonize with ComBat, dimensions are N_samples x N_features
    
    covars : a pandas DataFrame 
        contains covariates to control for during harmonization
        all covariates must be encoded numerically (no categorical variables)
        must contain a single column "SITE" with site labels for ComBat
        dimensions are N_samples x (N_covariates + 1)
        
    eb : bool, default True
        whether to use empirical Bayes estimates of site effects
        
    smooth_terms (Optional) :  a list, default []
        names of columns in covars to include as smooth, nonlinear terms
        can be any or all columns in covars, except "SITE"
        if empty, ComBat is applied with a linear model of covariates
        if not empty, Generalized Additive Models (GAMs) are used
        will increase computation time due to search for optimal smoothing
        
    smooth_term_bounds (Optional) : tuple of float, default (None, None)
        feature to support custom boundaries of the smoothing terms
        useful when holdout data covers a different range than the training data
        specify the bounds as (minimum, maximum)
        currently not supported for models with multiple smooth terms
        
    return_s_data (Optional) : bool, default False
        whether to return s_data, the standardized data array
        can be useful for diagnostics but will be costly to save/load if large
        
    Returns
    -------
    model : a dictionary of estimated model parameters
        design, var_pooled, B_hat, grand_mean,
        gamma_star, delta_star, info_dict (a neuroCombat invention),
        gamma_hat, delta_hat, gamma_bar, t2, a_prior, b_prior, smooth_model
    
    bayes_data : a numpy array
        harmonized data, dimensions are N_samples x N_features
        
    s_data (Optional) : a numpy array
        the standardized data array, returned only if return_s_data=True
    
    """
    # transpose data as per ComBat convention
    data = data.T
    # prep covariate data
    batch_col = covars.columns.get_loc('SITE')
    cat_cols = []
    num_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c != 'SITE'
    ]
    smooth_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c in smooth_terms
    ]
    # maintain a dictionary of smoothing information
    smooth_model = {
        'perform_smoothing': len(smooth_terms) > 0,
        'smooth_terms': smooth_terms,
        'smooth_cols': smooth_cols,
        'bsplines_constructor': None,
        'formula': None,
        'df_gam': None
    }
    covars = np.array(covars, dtype='object')
    ### additional setup code from neuroCombat implementation:
    # convert batch col to integer
    covars[:, batch_col] = np.unique(covars[:, batch_col],
                                     return_inverse=True)[-1]
    # create dictionary that stores batch info
    (batch_levels, sample_per_batch) = np.unique(covars[:, batch_col],
                                                 return_counts=True)
    info_dict = {
        'batch_levels': batch_levels.astype('int'),
        'n_batch': len(batch_levels),
        'n_sample': int(covars.shape[0]),
        'sample_per_batch': sample_per_batch.astype('int'),
        'batch_info': [
            list(np.where(covars[:, batch_col] == idx)[0])
            for idx in batch_levels
        ]
    }
    ###
    design = make_design_matrix(covars, batch_col, cat_cols, num_cols)
    ### additional setup if smoothing is performed
    if smooth_model['perform_smoothing']:
        # create cubic spline basis for smooth terms
        X_spline = covars[:, smooth_cols].astype(float)
        bs = BSplines(X_spline,
                      df=[10] * len(smooth_cols),
                      degree=[3] * len(smooth_cols),
                      knot_kwds=[{
                          'lower_bound': smooth_term_bounds[0],
                          'upper_bound': smooth_term_bounds[1]
                      }])
        # construct formula and dataframe required for gam
        formula = 'y ~ '
        df_gam = {}
        for b in batch_levels:
            formula = formula + 'x' + str(b) + ' + '
            df_gam['x' + str(b)] = design[:, b]
        for c in num_cols:
            if c not in smooth_cols:
                formula = formula + 'c' + str(c) + ' + '
                df_gam['c' + str(c)] = covars[:, c].astype(float)
        formula = formula[:-2] + '- 1'
        df_gam = pd.DataFrame(df_gam)
        # for matrix operations, a modified design matrix is required
        design = np.concatenate((df_gam, bs.basis), axis=1)
        # store objects in dictionary
        smooth_model['bsplines_constructor'] = bs
        smooth_model['formula'] = formula
        smooth_model['df_gam'] = df_gam
    ###
    # run steps to perform ComBat
    s_data, stand_mean, var_pooled, B_hat, grand_mean = StandardizeAcrossFeatures(
        data, design, info_dict, smooth_model)
    LS_dict = fit_LS_model_and_find_priors(s_data, design, info_dict)
    # optional: avoid EB estimates
    if eb:
        gamma_star, delta_star = find_parametric_adjustments(
            s_data, LS_dict, info_dict)
    else:
        gamma_star = LS_dict['gamma_hat']
        delta_star = np.array(LS_dict['delta_hat'])
    bayes_data = adjust_data_final(s_data, design, gamma_star, delta_star,
                                   stand_mean, var_pooled, info_dict)
    # save model parameters in single object
    model = {
        'design': design,
        'var_pooled': var_pooled,
        'B_hat': B_hat,
        'grand_mean': grand_mean,
        'gamma_star': gamma_star,
        'delta_star': delta_star,
        'info_dict': info_dict,
        'gamma_hat': LS_dict['gamma_hat'],
        'delta_hat': np.array(LS_dict['delta_hat']),
        'gamma_bar': LS_dict['gamma_bar'],
        't2': LS_dict['t2'],
        'a_prior': LS_dict['a_prior'],
        'b_prior': LS_dict['b_prior'],
        'smooth_model': smooth_model,
        'eb': eb
    }
    # transpose data to return to original shape
    bayes_data = bayes_data.T

    if return_s_data:
        return model, bayes_data, s_data
    else:
        return model, bayes_data
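
# The smooth model stored in the returned dictionary can be reused on new
# covariate values: Example #9 above calls
# model['smooth_model']['bsplines_constructor'].transform() for exactly this.
# A minimal sketch; `new_ages` is an illustrative assumption:
import numpy as np

bs_constructor = model['smooth_model']['bsplines_constructor']
new_ages = np.linspace(20.0, 80.0, num=50).reshape(-1, 1)
new_basis = bs_constructor.transform(new_ages)  # N_new x df spline basis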
Example #11
def GAM_pt(pse_t,
           expr,
           smooth='BSplines',
           df=5,
           degree=3,
           family=sm.families.NegativeBinomial()):
    """\
    Fit a Generalized Additive Model with the exog to be the pseudo-time. The likelihood ratio test is performed 
    to test the significance of pseudo-time in affecting gene expression value

    Parameters
    ----------
    pse_t
        pseudo-time
    expr
        expression value
    smooth
        choose between BSplines and CyclicCubicSplines
    df
        number of basis functions, or degrees of freedom
    degree
        degree of the spline function
    family
        distribution family to choose, default is negative binomial.

    Returns
    -------
    y_full
        predict regressed value with full model
    y_reduced
        predict regressed value from null hypothesis
    lr_pvalue
        p-value
    """
    from scipy import stats
    from statsmodels.gam.api import GLMGam, BSplines, CyclicCubicSplines

    if smooth == 'BSplines':
        spline = BSplines(pse_t, df=[df], degree=[degree])
    elif smooth == 'CyclicCubicSplines':
        spline = CyclicCubicSplines(pse_t, df=[df])
    else:
        raise ValueError(f"unknown smooth basis: {smooth}")

    exog, endog = sm.add_constant(pse_t), expr
    # calculate full model
    model_full = sm.GLMGam(endog=endog,
                           exog=exog,
                           smoother=spline,
                           family=family)
    try:
        res_full = model_full.fit()
    except Exception:
        # fitting can fail when the gene expression is mostly zero
        return None, None, None
    else:
        # default is exog
        y_full = res_full.predict()
        # reduced model
        y_reduced = res_full.null

        # number of samples - number of paras (res_full.df_resid)
        df_full_residual = expr.shape[0] - df
        df_reduced_residual = expr.shape[0] - 1

        # likelihood of full model
        llf_full = res_full.llf
        # likelihood of reduced(null) model
        llf_reduced = res_full.llnull

        lrdf = (df_reduced_residual - df_full_residual)
        lrstat = -2 * (llf_reduced - llf_full)
        lr_pvalue = stats.chi2.sf(lrstat, df=lrdf)
        return y_full, y_reduced, lr_pvalue
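
# A small synthetic check of GAM_pt(): expression counts that rise smoothly
# with pseudo-time should yield a small likelihood-ratio p-value. The data
# below is an illustrative assumption; a Poisson family keeps the sketch
# simple.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
pse_t = np.sort(rng.uniform(0, 1, size=200)).reshape(-1, 1)  # pseudo-time
expr = rng.poisson(lam=np.exp(1 + 2 * pse_t.ravel()))        # increasing counts

y_full, y_reduced, lr_pvalue = GAM_pt(pse_t, expr,
                                      smooth='BSplines',
                                      df=5, degree=3,
                                      family=sm.families.Poisson())
print(lr_pvalue)  # expected to be small for a genuine pseudo-time effect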
Example #12
import statsmodels.api as sm
import numpy as np
from statsmodels.gam.api import GLMGam, BSplines
from statsmodels.gam.tests.test_penalized import df_autos
x_spline = df_autos[['weight', 'hp']]
bs = BSplines(x_spline, df=[12, 10], degree=[3, 3])
alpha = np.array([21833888.8, 6460.38479])
gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive',
                             data=df_autos,
                             smoother=bs,
                             alpha=alpha)

res_bs = gam_bs.fit()
print(res_bs.summary())
res_bs.plot_partial(0, cpr=True)
res_bs.plot_partial(1, cpr=True)
alpha = np.array([8283989284.5829611, 14628207.58927821])
gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive',
                             data=df_autos,
                             smoother=bs,
                             alpha=alpha,
                             family=sm.families.Poisson())

res_bs = gam_bs.fit()
print(res_bs.summary())
# data-driven choice of alpha: AIC-based search and k-fold cross-validation
gam_bs.select_penweight()[0]
gam_bs.select_penweight_kfold()[0]
Example #13
plt.plot(x, p(x), 'k-')

#

from patsy import bs
kts = [0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 1]
z = sm.OLS(y, bs(x, knots=kts, include_intercept=True)).fit()
plt.scatter(x, y)
plt.plot(x, z.fittedvalues)

# ## Additive Models
#

from statsmodels.gam.api import GLMGam, BSplines
xmat = ethanol[['C', 'E']]
# note: this rebinds the name `bs` imported from patsy above
bs = BSplines(xmat, df=[4, 4], degree=[3, 3])
gamod = GLMGam.from_formula('NOx ~ C + E', ethanol, smoother=bs).fit()

#

fig = gamod.plot_partial(0, cpr=True)

#

fig = gamod.plot_partial(1, cpr=True)

# ## More Complex Models
# ## Exercises

# ## Packages Used