def setup_class(cls):
    """Fit an OrderedModel with a user-defined cloglog distribution on
    every supported input variant (numpy, pandas, formula, unordered
    categorical codes) and attach results to the class for the tests."""
    data = ds.df
    data_unordered = ds.df_unordered

    # Minimal SciPy distribution: only the ppf/cdf pair needed here.
    class CLogLog(stats.rv_continuous):
        def _ppf(self, q):
            return np.log(-np.log(1 - q))

        def _cdf(self, x):
            return 1 - np.exp(-np.exp(x))

    cloglog = CLogLog()

    # plain numpy endog (codes) and exog
    mod = OrderedModel(data['apply'].values.codes,
                       np.asarray(data[['pared', 'public', 'gpa']], float),
                       distr=cloglog)
    res = mod.fit(method='bfgs', disp=False)

    # pandas endog/exog
    modp = OrderedModel(data['apply'],
                        data[['pared', 'public', 'gpa']],
                        distr=cloglog)
    resp = modp.fit(method='bfgs', disp=False)

    # formula interface (no intercept, thresholds take its place)
    modf = OrderedModel.from_formula(
        "apply ~ pared + public + gpa - 1",
        data={"apply": data['apply'].values.codes,
              "pared": data['pared'],
              "public": data['public'],
              "gpa": data['gpa']},
        distr=cloglog)
    resf = modf.fit(method='bfgs', disp=False)

    # codes taken from a categorical created with ordered=False
    modu = OrderedModel(
        data_unordered['apply'].values.codes,
        np.asarray(data_unordered[['pared', 'public', 'gpa']], float),
        distr=cloglog)
    resu = modu.fit(method='bfgs', disp=False)

    from .results.results_ordinal_model import res_ord_cloglog as res2
    cls.res2 = res2
    cls.res1 = res
    cls.resp = resp
    cls.resf = resf
    cls.resu = resu
def setup_class(cls):
    """Fit the probit OrderedModel on every supported input variant and
    store the results plus regression reference numbers on the class."""
    data = ds.df
    data_unordered = ds.df_unordered

    def _fit(model):
        # shared fit settings for all variants
        return model.fit(method='bfgs', disp=False)

    # plain numpy endog (codes) and exog
    res = _fit(OrderedModel(
        data['apply'].values.codes,
        np.asarray(data[['pared', 'public', 'gpa']], float),
        distr='probit'))

    # pandas endog/exog
    resp = _fit(OrderedModel(data['apply'],
                             data[['pared', 'public', 'gpa']],
                             distr='probit'))

    # formula interface (no intercept, thresholds take its place)
    resf = _fit(OrderedModel.from_formula(
        "apply ~ pared + public + gpa - 1",
        data={"apply": data['apply'].values.codes,
              "pared": data['pared'],
              "public": data['public'],
              "gpa": data['gpa']},
        distr='probit'))

    # codes taken from a categorical created with ordered=False
    resu = _fit(OrderedModel(
        data_unordered['apply'].values.codes,
        np.asarray(data_unordered[['pared', 'public', 'gpa']], float),
        distr='probit'))

    from .results.results_ordinal_model import res_ord_probit as res2
    cls.res2 = res2
    cls.res1 = res
    cls.resp = resp
    cls.resf = resf
    cls.resu = resu

    # regression numbers
    cls.pred_table = np.array(
        [[202, 18, 0, 220],
         [112, 28, 0, 140],
         [27, 13, 0, 40],  # noqa
         [341, 59, 0, 400]], dtype=np.int64)
def setup_class(cls):
    """Fit logit models on data extended with a dummy regressor whose
    column name mimics the patsy-generated name for ``C(dummy)``."""
    data = ds.df
    nobs = len(data)
    # 1.0 for the first half of the rows, 0.0 for the second half
    data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)
    # alias to correspond to patsy name
    data["C(dummy)[T.1.0]"] = data["dummy"]
    cls.data = data

    exog_names = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa']

    # standard fit, numpy exog
    cls.res = OrderedModel(
        data['apply'].values.codes,
        np.asarray(data[exog_names], float),
        distr='logit').fit(method='bfgs', disp=False)

    # standard fit with pandas input
    cls.resp = OrderedModel(
        data['apply'], data[exog_names],
        distr='logit').fit(method='bfgs', disp=False)
def setup_class(cls):
    """Fit the logit OrderedModel on every supported input variant and
    attach the results and the reference numbers to the class."""
    data = ds.df
    data_unordered = ds.df_unordered

    def _fit(model):
        # shared fit settings for all variants
        return model.fit(method='bfgs', disp=False)

    # standard fit, numpy endog (codes) and exog
    res = _fit(OrderedModel(
        data['apply'].values.codes,
        np.asarray(data[['pared', 'public', 'gpa']], float),
        distr='logit'))

    # standard fit with pandas input
    resp = _fit(OrderedModel(data['apply'],
                             data[['pared', 'public', 'gpa']],
                             distr='logit'))

    # fit with formula (no intercept, thresholds take its place)
    resf = _fit(OrderedModel.from_formula(
        "apply ~ pared + public + gpa - 1",
        data={"apply": data['apply'].values.codes,
              "pared": data['pared'],
              "public": data['public'],
              "gpa": data['gpa']},
        distr='logit'))

    # fit on data with ordered=False
    resu = _fit(OrderedModel(
        data_unordered['apply'].values.codes,
        np.asarray(data_unordered[['pared', 'public', 'gpa']], float),
        distr='logit'))

    from .results.results_ordinal_model import res_ord_logit as res2
    cls.res2 = res2
    cls.res1 = res
    cls.resp = resp
    cls.resf = resf
    cls.resu = resu
def test_nan_endog_exceptions():
    """Invalid endog/exog inputs must raise informative ValueErrors."""
    nobs = 15
    endog = np.repeat(np.arange(3), nobs // 3)
    exog = np.column_stack((np.ones(nobs), np.arange(nobs)))

    # a constant column in exog is rejected
    with pytest.raises(ValueError, match="not be a constant"):
        OrderedModel(endog, exog, distr='logit')

    # NaN in a float endog is rejected
    endog_nan = endog.astype(float)
    endog_nan[0] = np.nan
    with pytest.raises(ValueError, match="NaN in dependent variable"):
        OrderedModel(endog_nan, exog[:, 1:], distr='logit')

    if hasattr(pd, "CategoricalDtype"):
        # value 0 is not among the declared categories [1, 2, 3], so the
        # categorical endog contains missing values
        frame = pd.DataFrame({
            "endog": pd.Series(
                endog, dtype=pd.CategoricalDtype([1, 2, 3], ordered=True)),
            "exog": exog[:, 1],
        })
        with pytest.raises(ValueError,
                           match="missing values in categorical endog"):
            OrderedModel(frame["endog"], frame[["exog"]])
def test_attributes(self):
    """Compare a two-level OrderedModel against a binary Logit.

    After dropping the middle category, the ordered logit is equivalent
    to a binary logit: slope params agree, the single threshold equals
    the negative of the Logit constant, and the likelihood-based results
    attributes must agree under several cov_types.
    """
    data = ds.df
    mask_drop = data['apply'] == "somewhat likely"
    data2 = data.loc[~mask_drop, :].copy()
    # we need to remove the category also from the Categorical Index,
    # otherwise the model would still see three levels
    data2['apply'] = data2['apply'].cat.remove_categories("somewhat likely")

    # standard fit with pandas input
    modp = OrderedModel(data2['apply'],
                        data2[['pared', 'public', 'gpa']],
                        distr='logit')
    resp = modp.fit(method='bfgs', disp=False)

    exog = add_constant(data2[['pared', 'public', 'gpa']], prepend=False)
    mod_logit = Logit(data2['apply'].cat.codes, exog)
    res_logit = mod_logit.fit()

    # bug fix: "llnull" was listed twice and therefore checked redundantly
    attributes = ("bse df_resid llf aic bic "
                  "llnull llr llr_pvalue prsquared").split()

    # slopes agree; the threshold has the opposite sign of the constant
    assert_allclose(resp.params[:3], res_logit.params[:3], rtol=1e-5)
    assert_allclose(resp.params[3], -res_logit.params[3], rtol=1e-5)
    for attr in attributes:
        assert_allclose(getattr(resp, attr), getattr(res_logit, attr),
                        rtol=1e-4)

    # robust cov_type "hac" must not break the equivalence
    resp = modp.fit(method='bfgs', disp=False,
                    cov_type="hac", cov_kwds={"maxlags": 2})
    res_logit = mod_logit.fit(method='bfgs', disp=False,
                              cov_type="hac", cov_kwds={"maxlags": 2})
    for attr in attributes:
        assert_allclose(getattr(resp, attr), getattr(res_logit, attr),
                        rtol=1e-4)

    # robust cov_type "hc1" must not break the equivalence
    resp = modp.fit(method='bfgs', disp=False, cov_type="hc1")
    res_logit = mod_logit.fit(method='bfgs', disp=False, cov_type="hc1")
    for attr in attributes:
        assert_allclose(getattr(resp, attr), getattr(res_logit, attr),
                        rtol=1e-4)
def test_loglikerelated(self):
    """Check score/score_obs consistency and the threshold-only null model."""
    res1 = self.res1
    mod = res1.model

    # evaluate away from the optimum so the score is nonzero
    params_off = res1.params * 1.1
    score_full = mod.score(params_off)
    score_obs_numdiff = mod.score_obs(params_off)
    score_obs_exog = mod.score_obs_(params_off)
    # summed per-observation scores reproduce the full score vector
    assert_allclose(score_obs_numdiff.sum(0), score_full, atol=1e-7)
    # the analytic exog part matches the first k_vars entries
    assert_allclose(score_obs_exog.sum(0), score_full[:mod.k_vars],
                    atol=1e-7)

    # a model without exog reproduces the start_params thresholds and
    # the null log-likelihood
    mod_null = OrderedModel(mod.endog, None,
                            offset=np.zeros(mod.nobs),
                            distr=mod.distr)
    res_null = mod_null.fit(method='bfgs', disp=False)
    assert_allclose(res_null.params, mod.start_params[mod.k_vars:],
                    rtol=1e-8)
    assert_allclose(res1.llnull, res_null.llf, rtol=1e-8)
def main():
    """Run per-question, per-timepoint regressions of questionnaire answers
    on a selected polygenic score (PGS) and export betas, p-values,
    standard errors, N and value counts per PRS.

    Command-line arguments (positional):
      1: combined questionnaire dataframe (pickle)
      2: question-id-over-time overview (tsv)
      3: participant PGS values (pickle)
      4: output directory
      5: output filename suffix
      6: question/PRS selection file (tsv)
      7: trait selection file -- NOTE(review): argv[7] is ALSO parsed as
         the integer ``pgs_id`` below; one of the two indices is almost
         certainly wrong. Preserved as-is; confirm the intended CLI.
      8: model (question type) selection file (tsv)
    """
    ###
    ### input paths
    ###
    input_df_path = sys.argv[1]  # dataframe with the combined questionnaire data
    input_question_overview = sys.argv[2]  # dataframe with the question ids over time
    input_pgs_path = sys.argv[3]  # dataframe with the PGS values of the participants
    output_dir_ori_path = sys.argv[4]  # output directory path
    suffix = sys.argv[5]  # suffix for the outputs
    # NOTE(review): argv[7] is used both here (as an int) and below as
    # selected_trait_file_path -- see docstring.
    pgs_id = int(
        sys.argv[7]
    )  # int of pgs number, used for multi node analysis on cluster
    question_prs_selection_path = sys.argv[6]  # input path of question and prs selection
    selected_trait_file_path = sys.argv[7]  # input path of trait selection file
    model_selection_file = sys.argv[8]  # input path of model type selection file

    df_question_prs_selection = pd.read_csv(question_prs_selection_path,
                                            sep="\t")

    # create output directory
    create_dir(output_dir_ori_path)

    # create log file (kept open for the whole run, closed before export)
    log_file_path = "log_file_{id}_{suffix}_{date}.txt".format(
        id=pgs_id, suffix=suffix,
        date=datetime.now().strftime("%d-%m-%Y"))
    logfile = open(os.path.join(output_dir_ori_path, log_file_path), "w")

    # read input files
    df_quest = pd.read_pickle(input_df_path)
    df_question_ids_total = pd.read_csv(input_question_overview, sep="\t",
                                        index_col=0, dtype="str")
    # drop metadata columns; remaining columns are timepoint numbers
    df_question_ids = df_question_ids_total.loc[
        :, df_question_ids_total.columns.difference(
            ["Number of timepoints", "Question answers"])]
    df_question_ids.columns = df_question_ids.columns.astype(float)
    df_question_ids = df_question_ids.T
    df_question_ids = df_question_ids.sort_index()
    trait_subset = pd.read_pickle(input_pgs_path)

    # Filter questionnaire question data on meta information
    start_columns = df_quest.columns[df_quest.columns.str.startswith("covt")]
    date_columns = df_quest.columns[df_quest.columns.str.endswith("DATE")]
    age_columns = df_quest.columns[df_quest.columns.str.endswith("AGE")]
    gender_columns = df_quest.columns[df_quest.columns.str.endswith("GENDER")]
    date_variant_id = df_quest.columns[df_quest.columns.str.endswith(
        "VARIANT_ID")]
    date_zip_code = df_quest.columns[df_quest.columns.str.endswith("ZIP_CODE")]
    date_response_rate = df_quest.columns[df_quest.columns.str.contains(
        "responsedate")]
    skip_columns = [
        "covt17_COVID172TXT", "covt17_COVID177TXT", "covt17_COVID192A",
        "covt17_COVID192B", "covt17_PSEUDOIDEXT"
    ]
    # answer columns = "covt*" minus all metadata/skip columns
    selection_columns = start_columns.difference(date_columns).difference(
        age_columns).difference(gender_columns).difference(
            date_variant_id).difference(date_zip_code).difference(
                date_response_rate).difference(skip_columns)

    # keep only participants that also have PGS values
    df_quest = df_quest.loc[df_quest.index.intersection(trait_subset.index), :]

    # trait selection: sanitize PGS column names, then restrict to the
    # selected traits and finally to the single trait at position pgs_id
    trait_subset.columns = trait_subset.columns.str.replace(
        "/", ".").str.replace(" ", ".").str.replace("-", ".").str.replace(
            "(", ".").str.replace(")", ".")
    df_selected_traits = pd.read_csv(selected_trait_file_path, header=None)
    trait_subset = trait_subset.loc[
        :, trait_subset.columns.intersection(df_selected_traits.iloc[:, 0])]
    trait_subset = trait_subset.iloc[:, [pgs_id]]
    print("Process trade: {}".format(trait_subset.columns[0]))

    # model covariate columns
    correction_columns = [
        "age_recent", "age2_recent", "chronic_recent", "household_recent",
        "have_childs_at_home_recent", "gender_recent"
    ]
    correction_df = df_quest.loc[:, correction_columns]

    # question selection
    df_selected_questions = pd.read_csv(model_selection_file, sep="\t",
                                        index_col="Question")
    df_selected_questions = df_selected_questions.dropna(subset=["Type"])
    question_list = df_selected_questions.index.intersection(
        df_question_ids.columns)

    # create output dataframes: one column per (question, timepoint)
    multiIndex_columns = pd.MultiIndex.from_product(
        [question_list, df_question_ids.index],
        names=["question", "quest_nr"])
    df_betas_per_question = pd.DataFrame(index=trait_subset.columns,
                                         columns=multiIndex_columns)
    df_pvalues_per_question = pd.DataFrame(index=trait_subset.columns,
                                           columns=multiIndex_columns)
    df_se_values_per_question = pd.DataFrame(index=trait_subset.columns,
                                             columns=multiIndex_columns)
    n_values_per_question = []
    value_counts_per_question = []

    # process all questions
    for column in question_list:
        question_ids_of_question = df_question_ids.loc[:, column]
        question_ids_of_question = question_ids_of_question[
            ~question_ids_of_question.isna()]

        # create temp lists for model outputs
        betas_per_week = []
        pvalues_per_week = []
        se_values_per_week = []
        n_values_per_week = {}
        values_counts_per_week = {}

        # process all datapoints (timepoints) of the question
        # bug fix: Series.iteritems() was removed in pandas 2.0; .items()
        # is the identical replacement
        for index, quest_id in question_ids_of_question.items():
            if quest_id in selection_columns:
                # create model input data
                df_subset = df_quest.loc[:, quest_id]
                df_subset = df_subset.astype(float)
                df_subset = df_subset.dropna()

                # find the correct model for this question type
                model_type = df_selected_questions.loc[column, "Type"]
                if model_type == "ordinal":
                    df_subset = df_subset.sort_index()
                    df_subset = df_subset.astype(int).astype("category")
                    model_type = "ordinal"
                elif model_type == "ordinal-ordered" or \
                        model_type == "ordinal-ordered-turned":
                    df_subset = df_subset.sort_index()
                    df_subset = df_subset.astype(int).astype(
                        "category").cat.as_ordered()
                    model_type = "ordinal"

                # fit all models per trait
                beta_values_per_model = {}
                pvalues_per_model = {}
                se_values_per_model = {}
                for pgs_column_name in trait_subset.columns:
                    # only fit combinations present in the selection file
                    if ((df_question_prs_selection["Question"] == column) &
                            (df_question_prs_selection["prs"]
                             == pgs_column_name)).any():
                        # write logfile info
                        logfile.write(
                            "\n\nProcess: {question}, {question_id}, {pgs}, {model}\n"
                            .format(question=column,
                                    question_id=quest_id,
                                    pgs=pgs_column_name,
                                    model=model_type))
                        print("process", column, pgs_column_name)
                        try:
                            # create model and set the model parameters
                            df_model_input = pd.merge(
                                trait_subset.loc[:, [pgs_column_name]],
                                correction_df,
                                left_index=True, right_index=True)
                            fit_args = {}
                            if model_type == "binomial":
                                df_model_input["intercept"] = 1.0
                                mod = sm.Logit(
                                    df_subset,
                                    df_model_input.loc[df_subset.index, :])
                                fit_args["maxiter"] = 10000
                            elif model_type == "ordinal":
                                # OrderedModel adds thresholds itself,
                                # so no intercept column here
                                mod = OrderedModel(
                                    df_subset,
                                    df_model_input.loc[df_subset.index, :],
                                    distr="logit")
                                fit_args["method"] = 'bfgs'
                                fit_args["maxiter"] = 10000
                            else:
                                df_model_input["intercept"] = 1.0
                                mod = sm.OLS(
                                    df_subset,
                                    df_model_input.loc[df_subset.index, :])

                            # fit the model
                            res = mod.fit(**fit_args)

                            # write model output
                            print(res.summary())
                            logfile.write(res.summary().as_text())
                            logfile.write("\n")

                            # save output from the model
                            beta_values_per_model[
                                pgs_column_name] = res.params[pgs_column_name]
                            pvalues_per_model[pgs_column_name] = res.pvalues[
                                pgs_column_name]
                            se_values_per_model[pgs_column_name] = res.bse[
                                pgs_column_name]
                        except sm.tools.sm_exceptions.PerfectSeparationError:
                            print("Exception PerfectSeparationError",
                                  pgs_column_name, quest_id, column)
                            logfile.write(
                                "Error PerfectSeparationError: {question_id}, {pgs}\n"
                                .format(question_id=quest_id,
                                        pgs=pgs_column_name))
                            continue
                        except np.linalg.LinAlgError:
                            print("Exception LinAlgError", pgs_column_name,
                                  quest_id, column)
                            logfile.write(
                                "Error LinAlgError: {question_id}, {pgs}\n".
                                format(question_id=quest_id,
                                       pgs=pgs_column_name))
                            continue
                        except UnboundLocalError:
                            print("Exception UnboundLocalError",
                                  pgs_column_name, quest_id, column)
                            logfile.write(
                                "Error UnboundLocalError: {question_id}, {pgs}\n"
                                .format(question_id=quest_id,
                                        pgs=pgs_column_name))
                            continue
                        except Exception:
                            # best-effort: log any other failure and move on
                            print("Exception", pgs_column_name, quest_id,
                                  column)
                            logfile.write(
                                "Error Exception (general): {question_id}, {pgs}\n"
                                .format(question_id=quest_id,
                                        pgs=pgs_column_name))
                            continue

                # save all the model output per time point
                model_betas = pd.Series(beta_values_per_model, name=index)
                betas_per_week.append(model_betas)
                model_pvalues = pd.Series(pvalues_per_model, name=index)
                pvalues_per_week.append(model_pvalues)
                model_se_values = pd.Series(se_values_per_model, name=index)
                se_values_per_week.append(model_se_values)
                n_values_per_week[index] = df_subset.shape[0]
                val_counts = df_subset.value_counts()
                val_counts.index = val_counts.index.astype(int).astype(str)
                values_counts_per_week[index] = json.dumps(
                    val_counts.to_dict())

        # save all the model information per PRS, per time point,
        # per question (multi column table)
        if len(betas_per_week) > 0:
            df_model_betas = pd.concat(betas_per_week, axis=1)
            df_model_betas.columns = pd.MultiIndex.from_product(
                [[column], df_model_betas.columns])
            df_betas_per_question.loc[
                df_model_betas.index, df_model_betas.columns] = df_model_betas

            df_model_pvalues = pd.concat(pvalues_per_week, axis=1)
            df_model_pvalues.columns = pd.MultiIndex.from_product(
                [[column], df_model_pvalues.columns])
            df_pvalues_per_question.loc[
                df_model_pvalues.index,
                df_model_pvalues.columns] = df_model_pvalues

            df_model_se_values = pd.concat(se_values_per_week, axis=1)
            df_model_se_values.columns = pd.MultiIndex.from_product(
                [[column], df_model_se_values.columns])
            df_se_values_per_question.loc[
                df_model_se_values.index,
                df_model_se_values.columns] = df_model_se_values

            n_values_per_question.append(
                pd.Series(n_values_per_week, name=column))
            value_counts_per_question.append(
                pd.Series(values_counts_per_week, name=column))

    df_nvalues = pd.concat(n_values_per_question, axis=1)
    df_value_counts = pd.concat(value_counts_per_question, axis=1)
    logfile.close()
    print("correlations calculation ready")

    # save the results per PRS in separated folders
    for PRS_name in df_betas_per_question.index:
        print("PRS_name", PRS_name)

        # get the export data per PRS from the dataframes
        df_prs_subset = df_betas_per_question.loc[PRS_name, :]
        df_prs_subset = df_prs_subset.unstack(level=-1)
        df_prs_subset_pvalues = df_pvalues_per_question.loc[PRS_name, :]
        df_prs_subset_pvalues = df_prs_subset_pvalues.unstack(level=-1)
        df_prs_subset_se_values = df_se_values_per_question.loc[PRS_name, :]
        df_prs_subset_se_values = df_prs_subset_se_values.unstack(level=-1)
        time_series_info_corr = df_prs_subset.copy()
        time_series_info_corr.columns = time_series_info_corr.columns.astype(
            str)

        # create the PRS output dir
        output_prs_dir = os.path.join(output_dir_ori_path, PRS_name)
        create_dir(output_prs_dir)

        # export the files
        df_prs_subset_pvalues.columns = df_prs_subset_pvalues.columns.astype(
            str)
        export_df(df_prs_subset_pvalues, output_prs_dir, PRS_name,
                  "p_values_{}".format(suffix))
        df_prs_subset_se_values.columns = \
            df_prs_subset_se_values.columns.astype(str)
        export_df(df_prs_subset_se_values, output_prs_dir, PRS_name,
                  "se_values_{}".format(suffix))

        # NOTE(review): this transpose happens inside the loop, so the
        # orientation of df_nvalues/df_value_counts flips on every
        # iteration. Harmless only while the loop runs once (trait_subset
        # was sliced to a single PGS above) -- confirm before reusing.
        df_nvalues = df_nvalues.T
        print(df_nvalues)
        df_nvalues.columns = df_nvalues.columns.astype(str)
        export_df(df_nvalues, output_prs_dir, PRS_name,
                  "n_values_{}".format(suffix))
        df_value_counts = df_value_counts.T
        df_value_counts.columns = df_value_counts.columns.astype(str)
        export_df(df_value_counts, output_prs_dir, PRS_name,
                  "value_counts_{}".format(suffix))

        time_series_info_corr["Question answers"] = df_selected_questions.loc[
            time_series_info_corr.index, "Question answers"]
        time_series_info_corr = time_series_info_corr.loc[:, [
            "Question answers", *list(df_prs_subset.columns.astype(str))
        ]]
        export_df(time_series_info_corr, output_prs_dir, PRS_name,
                  "correlations_{}".format(suffix))
def ordinal_regression(x, y, distr="probit"):
    """Fit an ordinal regression of ``y`` on ``x``.

    Returns the fitted results object, its text summary, and the odds
    ratios computed from it (via ``get_odds_radio``).
    """
    fitted = OrderedModel(y, x, distr=distr).fit(method="bfgs")
    fit_summary = fitted.summary()
    odds_radio = get_odds_radio(fitted)
    return fitted, fit_summary, odds_radio
import pandas
from statsmodels.miscmodels.ordinal_model import OrderedModel

# NOTE(review): this chunk also relies on `np` (numpy) and `stats`
# (scipy.stats) being imported earlier in the file -- confirm.

# Simulate an ordinal outcome from a latent linear model: the latent
# variable is clipped to [-2.4, 2.4], rounded, and shifted to codes 0..4.
nobs, k_vars = 1000, 3
x = np.random.randn(nobs, k_vars)
xb = x.dot(np.ones(k_vars))
y_latent = xb + np.random.randn(nobs)
y = np.round(np.clip(y_latent, -2.4, 2.4)).astype(int) + 2
print(np.unique(y))
print(np.bincount(y))

mod = OrderedModel(y, x)

# Start values: zeros for the slopes, plus thresholds derived from the
# empirical class frequencies. The first threshold is kept as-is and the
# remaining ones are parameterized as log-differences; the final cumsum
# value (ppf of 1.0 == inf) is dropped via [:-1].
start_ppf = stats.norm.ppf((np.bincount(y) / len(y)).cumsum())
start_threshold = np.concatenate(
    (start_ppf[:1], np.log(np.diff(start_ppf[:-1]))))
start_params = np.concatenate((np.zeros(k_vars), start_threshold))

# first a default (Nelder-Mead-style) fit, then a bfgs fit from the
# same start values
res = mod.fit(start_params=start_params, maxiter=5000, maxfun=5000)
print(res.params)
res = mod.fit(start_params=start_params, method='bfgs')
print(res.params)

# thresholds on the original scale: exp of the log-differences, cumulated
print(np.exp(res.params[-(mod.k_levels - 1):]).cumsum())
# categorical type, this is preferred over NumPy arrays. # The model is based on a numerical latent variable $y_{latent}$ that we # cannot observe but that we can compute thanks to exogenous variables. # Moreover we can use this $y_{latent}$ to define $y$ that we can observe. # # For more details see the the Documentation of OrderedModel, [the UCLA # webpage](https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/) # or this # [book](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470594001). # # ### Probit ordinal regression: mod_prob = OrderedModel(data_student['apply'], data_student[['pared', 'public', 'gpa']], distr='probit') res_prob = mod_prob.fit(method='bfgs') res_prob.summary() # In our model, we have 3 exogenous variables(the $\beta$s if we keep the # documentation's notations) so we have 3 coefficients that need to be # estimated. # # Those 3 estimations and their standard errors can be retrieved in the # summary table. # # Since there are 3 categories in the target variable(`unlikely`, # `somewhat likely`, `very likely`), we have two thresholds to estimate. # As explained in the doc of the method