def _initialize(cls):
    """Fit the reference GLM and the penalized GLM that the test compares.

    Stores the unpenalized oracle fit in ``cls.res2`` and the penalized
    fit on the full design in ``cls.res1``.
    """
    y, x = cls.y, cls.x
    # Use a nonzero offset so that offset handling is exercised as well.
    offset = -0.25 * np.ones(len(y))
    cov_type = 'HC0'

    # Oracle fit: unpenalized GLM restricted to the truly nonzero columns.
    model_oracle = GLM(y, x[:, :cls.k_nonzero], family=family.Binomial(),
                       offset=offset)
    cls.res2 = model_oracle.fit(cov_type=cov_type, method='newton',
                                maxiter=1000, disp=0)

    # Penalized fit on the full design matrix.
    model_pen = GLMPenalized(y, x, family=family.Binomial(), offset=offset,
                             penal=cls.penalty)
    model_pen.pen_weight *= 1  # lower than in other cases
    model_pen.penal.tau = 0.05
    cls.res1 = model_pen.fit(cov_type=cov_type, method='bfgs',
                             max_start_irls=0, maxiter=100, disp=0,
                             trim=0.001)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 1e-3
    cls.k_params = cls.k_nonzero
def _initialize(cls):
    """Fit reference and penalized binomial models for comparison.

    ``cls.res2`` holds the unpenalized fit on the nonzero columns,
    ``cls.res1`` the penalized fit on the full design.
    """
    y, x = cls.y, cls.x

    # Reference: plain GLM on the truly nonzero part of the design.
    reference = GLM(y, x[:, :cls.k_nonzero], family=family.Binomial())
    cls.res2 = reference.fit(disp=0)

    # Penalized model on all columns, with a reduced penalty weight.
    penalized = GLMPenalized(y, x, family=family.Binomial(),
                             penal=cls.penalty)
    penalized.pen_weight *= .5
    penalized.penal.tau = 0.05
    cls.res1 = penalized.fit(method='bfgs', maxiter=100, disp=0)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 5e-3
def setup_class(cls):
    """Build the restricted (mod2) and full (mod1) GLMs on the star98 data."""
    from statsmodels.datasets.star98 import load

    data = load()
    exog = add_constant(data.exog, prepend=True)
    # Constant offset of ones, included to exercise offset handling.
    offset = np.ones(len(data.endog))
    # Restricted design: drop the last five columns of the full design.
    exog_keep = exog[:, :-5]

    cls.mod2 = GLM(data.endog, exog_keep, family=family.Binomial(),
                   offset=offset)
    cls.mod1 = GLM(data.endog, exog, family=family.Binomial(),
                   offset=offset)
    cls.init()
def _initialize(cls):
    """Fit GLM and GLMPenalized with zero penalty; the fits should agree.

    With ``pen_weight = 0`` the penalized model reduces to a plain GLM,
    so results are compared at tight tolerance.
    """
    y, x = cls.y, cls.x
    x = x[:, :4]
    # Nonzero offset so that offset handling is checked too.
    offset = -0.25 * np.ones(len(y))

    baseline = GLM(y, x, family=family.Binomial(), offset=offset)
    cls.res2 = baseline.fit(method='bfgs', max_start_irls=100)

    unpenalized = GLMPenalized(y, x, family=family.Binomial(),
                               offset=offset, penal=cls.penalty)
    unpenalized.pen_weight = 0  # no penalty: should match the plain GLM
    cls.res1 = unpenalized.fit(method='bfgs', max_start_irls=3,
                               maxiter=100, disp=0,
                               start_params=cls.res2.params * 0.9)

    cls.atol = 1e-10
    cls.k_params = 4
def setup_class(cls):
    """Configure the binomial data-generating process and initialize."""
    super(TestGAMBinomial, cls).setup_class()
    # DGP: Bernoulli draws paired with a binomial GLM family.
    cls.rvs = stats.bernoulli.rvs
    cls.family = family.Binomial()
    cls.init()
def __init__(self):
    """Initialize the binomial data-generating process.

    Bug fix: the original called ``super(self.__class__, self).__init__()``,
    which resolves ``self.__class__`` dynamically and recurses infinitely as
    soon as this class is subclassed.  Zero-argument ``super()`` binds to the
    defining class and is safe.
    """
    super().__init__()
    # DGP: Bernoulli draws paired with a binomial GLM family.
    self.family = family.Binomial()
    self.rvs = stats.bernoulli.rvs
    self.init()
def test_glmlogit_screening():
    """Variable screening on a penalized logit model recovers the oracle fit."""
    y, x, idx_nonzero_true, beta = _get_logit_data()
    nobs = len(y)

    screener_kwds = dict(pen_weight=nobs * 0.75, threshold_trim=1e-3,
                         ranking_attr='model.score_factor')

    # Oracle: penalized GLM fitted on the truly nonzero columns only.
    xnames_true = ['var%4d' % ii for ii in idx_nonzero_true]
    xnames_true[0] = 'const'
    parameters = pd.DataFrame(beta[idx_nonzero_true], index=xnames_true,
                              columns=['true'])
    xframe_true = pd.DataFrame(x[:, idx_nonzero_true], columns=xnames_true)
    res_oracle = GLMPenalized(y, xframe_true, family=family.Binomial()).fit()
    parameters['oracle'] = res_oracle.params

    # Screening starts from an intercept-only penalized model.
    mod_initial = GLMPenalized(y, np.ones(nobs), family=family.Binomial())
    screener = VariableScreening(mod_initial, **screener_kwds)
    screener.k_max_add = 10
    exog_candidates = x[:, 1:]
    res_screen = screener.screen_exog(exog_candidates, maxiter=30)

    # Smoke-test the result attributes and summaries.
    res_screen.idx_nonzero
    res_screen.results_final
    xnames = ['var%4d' % ii for ii in res_screen.idx_nonzero]
    xnames[0] = 'const'
    res_screen.results_final.summary(xname=xnames)
    res_screen.results_pen.summary()

    assert_equal(res_screen.results_final.mle_retvals['converged'], True)

    # Join on names so extra selected parameters are tolerated.
    ps = pd.Series(res_screen.results_final.params, index=xnames,
                   name='final')
    parameters['final'] = ps
    assert_allclose(parameters['oracle'], parameters['final'], atol=0.005)
def __init__(self, family_name='normal', link_name='identity',
             fam_params=None):
    """Constructor.

    Parameters
    ----------
    family_name : str
        One of 'normal', 'binomial', 'poisson' (case-insensitive).
    link_name : str
        One of 'logit', 'log', 'identity', 'sqrt', 'probit'
        (case-insensitive).  A falsy value means no explicit link.
    fam_params : float, optional
        Scale passed to the normal random generator
        (``np.random.normal(x, fam_params)``).

    Raises
    ------
    ValueError
        If *link_name* or *family_name* is not recognized.  (Bug fix: the
        original silently left ``self.link`` unset / ``rand`` unbound,
        producing confusing errors much later.)
    """
    # Resolve the link function by name via a lookup table.
    self.link_name = link_name
    _links = {
        'logit': L.logit,
        'log': L.log,
        'identity': L.identity,
        'sqrt': L.sqrt,
        'probit': L.probit,
    }
    family_kwargs = {}
    if self.link_name:
        try:
            self.link = _links[self.link_name.lower()]
        except KeyError:
            raise ValueError("unrecognized link_name: %r" % (link_name,))
        family_kwargs['link'] = self.link

    # Resolve the family and its matching random-draw function.
    self.family_name = family_name
    fam_lower = self.family_name.lower()
    if fam_lower == 'normal':
        self.family = F.Gaussian(**family_kwargs)

        def rand(x):
            return np.random.normal(x, fam_params)
    elif fam_lower == 'binomial':
        self.family = F.Binomial(**family_kwargs)

        def rand(x):
            return np.random.binomial(1, x)
    elif fam_lower == 'poisson':
        self.family = F.Poisson(**family_kwargs)

        def rand(x):
            return np.random.poisson(x)
    else:
        raise ValueError("unrecognized family_name: %r" % (family_name,))

    self.rand = rand
    self.in_columns = None
    self.out_columns = None
m.fit(y) x = np.linspace(-2, 2, 50) print(m) y_pred = m.results.predict(d) plt.figure() plt.plot(y, '.') plt.plot(z, 'b-', label='true') plt.plot(y_pred, 'r-', label='AdditiveModel') plt.legend() plt.title('gam.AdditiveModel') if example == 2: print("binomial") f = family.Binomial() b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)]) b.shape = y.shape m = GAM(b, d, family=f) toc = time.time() m.fit(b) tic = time.time() print(tic - toc) if example == 3: print("Poisson") f = family.Poisson() y = y / y.max() * 3 yp = f.link.inverse(y) p = np.asarray([scipy.stats.poisson.rvs(p) for p in f.link.inverse(y)], float)
def test_umap_one():
    """UMAP-embed binary phecode data, derive a new case flag, and test it.

    Reads the cohort CSV (``sys.argv[1]``), the phecode list
    (``sys.argv[2]``), and writes the pairplot to ``sys.argv[3]``.

    Bug fixes vs. the original:
    - ``df`` keeps its original (gapped) index after ``drop`` while
      ``reduced_df`` has a fresh RangeIndex, so the cross-frame column
      assignments aligned on index and scrambled/NaN'd values; ``.values``
      makes them positional.
    - The results file is now opened with a context manager so the handle
      is closed even if a contingency test raises.
    """
    print('started')
    df = pd.read_csv(sys.argv[1], dtype={'location': str, 'Result': str})
    df = df.drop(df[df.BIRTH_DATETIME == '0'].index)
    phecodes = pd.read_csv(sys.argv[2], dtype=str)
    out = sys.argv[3]

    # Binarize the phecode columns present in the cohort frame.
    phe_list = [phe for phe in list(phecodes.PHECODE.unique()) if phe in df]
    phedf = df.loc[:, phe_list]
    phedf[phedf > 0] = 1
    df[phe_list] = phedf
    print('loaded')

    # Create embeddings: PCA to 50 components, then UMAP to 10.
    pca = PCA(n_components=50, random_state=42)
    pc_emb = pca.fit_transform(phedf)
    ump = umap.UMAP(metric='euclidean', n_components=10, random_state=42)
    ump_emb = ump.fit_transform(pc_emb)
    print('embedded')

    # Build the reduced frame; use .values to avoid index-alignment bugs.
    reduced_df = pd.DataFrame(ump_emb,
                              columns=['UMP-' + str(i + 1)
                                       for i in range(10)])
    reduced_df['CC_STATUS'] = df['CC_STATUS'].values

    # Visualization of the embedding colored by case/control status.
    sns.set()
    sns.pairplot(reduced_df, hue="CC_STATUS",
                 vars=['UMP-' + str(i + 1) for i in range(10)],
                 height=4, markers=['o', 's'], plot_kws=dict(alpha=0.1))
    plt.savefig(out)
    print('graphed')

    # Derive a new case flag from the second UMAP component.
    reduced_df['newcc'] = 0
    reduced_df.loc[reduced_df['UMP-2'] < -12, 'newcc'] = 1
    df['newcc'] = reduced_df['newcc'].values

    print('opening file')
    with open('files/umap_new_cases_chi_phecode_test_2.csv', 'w') as out_file:
        out_file.write('phecode,chi2,p,dof,control_neg,case_neg,'
                       'control_pos,case_pos\n')
        # Univariate chi-square test per phecode against the new flag.
        for phecode in phe_list:
            # 2x2 contingency counts for newcc x phecode.
            case_pos = df.loc[(df.newcc == 1) & (df[phecode] == 1)].shape[0]
            case_neg = df.loc[(df.newcc == 1) & (df[phecode] == 0)].shape[0]
            control_pos = df.loc[(df.newcc == 0) & (df[phecode] == 1)].shape[0]
            control_neg = df.loc[(df.newcc == 0) & (df[phecode] == 0)].shape[0]
            # Only run the test when every cell is populated.
            if case_pos > 0 and case_neg > 0 and control_pos > 0 \
                    and control_neg > 0:
                res = chi2_c([[control_neg, case_neg],
                              [control_pos, case_pos]])
                out_file.write(','.join([phecode, str(res[0]), str(res[1]),
                                         str(res[2]), str(control_neg),
                                         str(case_neg), str(control_pos),
                                         str(case_pos)]))
                out_file.write('\n')
    print('ran phecode tests')

    # Get age in (whole) years from the birth timestamp.
    df['AGE'] = pd.to_datetime(df['BIRTH_DATETIME'].str[:10],
                               format='%Y-%m-%d')
    df['AGE'] = (datetime.datetime.now() - df['AGE']).astype('timedelta64[Y]')

    # Regression of the new flag on the covariates.
    print('running regression')
    mod = smf.glm(formula='newcc ~ AGE + UNIQUE_PHECODES + RACE + GENDER + '
                          'RECORD_LENGTH_DAYS',
                  data=df, family=fam.Binomial())
    res = mod.fit()
    print(res.summary())
def covariate_analysis():
    """Regress case/control and normal/abnormal status on cohort covariates.

    Reads the cohort CSV from ``sys.argv[1]`` and prints two GLM summaries.

    Bug fix: ``norm_df`` was a ``.loc`` slice of ``cc_df``; assigning a new
    column to it raised SettingWithCopyWarning and the write was not
    guaranteed to take effect.  ``.copy()`` makes the frame independent.
    """
    cc_df = pd.read_csv(sys.argv[1])
    cc_df = cc_df.drop(cc_df[cc_df.BIRTH_DATETIME == '0'].index)

    # Age in days from the birth timestamp (dconvert/ddays are helpers
    # defined elsewhere in this module).
    cc_df['age'] = (datetime.datetime.now()
                    - cc_df["BIRTH_DATETIME"].str[:10].apply(dconvert))
    cc_df['age'] = cc_df['age'].apply(ddays)

    # Covariate model for case vs. control status.
    all_res = smf.glm(
        formula="CC_STATUS ~ weight_sum + RACE + GENDER + age + RECORD_LEN"
                " + GENDER*age + age*RECORD_LEN",
        data=cc_df, family=fam.Binomial()).fit()
    print("Results for Case/control data:")
    print(all_res.summary())

    # Within cases only: model normal vs. abnormal lab result.
    norm_df = cc_df.loc[cc_df.CC_STATUS == 1].copy()
    print(cc_df.shape)
    print(norm_df.shape)
    norm_df['normality_status'] = norm_df["Result"].apply(binarize_normal)
    normality_res = smf.glm(
        formula="normality_status ~ weight_sum + RACE + GENDER + age"
                " + RECORD_LEN + GENDER*age + age*RECORD_LEN",
        data=norm_df, family=fam.Binomial()).fit()
    print("Results for normal/abnormal data:")
    print(normality_res.summary())