Example #1
    def setup_class(cls):
        data = ds.df
        nobs = len(data)
        data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)
        # alias to correspond to patsy name
        data["C(dummy)[T.1.0]"] = data["dummy"]
        cls.data = data

        columns = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa']
        # standard fit
        mod = OrderedModel(data['apply'].values.codes,
                           np.asarray(data[columns], float),
                           distr='logit')
        cls.res = mod.fit(method='bfgs', disp=False)
        # standard fit with pandas input
        modp = OrderedModel(data['apply'],
                            data[columns],
                            distr='logit')
        cls.resp = modp.fit(method='bfgs', disp=False)
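
        # Not in the original snippet: a minimal sanity check, assuming the
        # array-based and pandas-based fits should agree on the estimates.
        np.testing.assert_allclose(cls.res.params,
                                   np.asarray(cls.resp.params), rtol=1e-5)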
Example #2
import json
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# create_dir and export_df are project-specific helpers defined elsewhere in
# the repository and are not shown in this snippet.


def main():
    ###
    ### input paths
    ###
    input_df_path = sys.argv[
        1]  # dataframe with the combined questionnaire data
    input_question_overview = sys.argv[
        2]  # dataframe with the question ids over time
    input_pgs_path = sys.argv[
        3]  # dataframe with the PGS values of the participants
    output_dir_ori_path = sys.argv[4]  # output directory path
    suffix = sys.argv[5]  # suffix for the outputs
    pgs_id = int(
        sys.argv[7]
    )  # int of pgs number, used for multi node analysis on cluster
    question_prs_selection_path = sys.argv[
        6]  # input path of question and prs selection
    selected_trait_file_path = sys.argv[
        7]  # input path of trait selection file
    model_selection_file = sys.argv[8]  # input path of model selection file
    df_question_prs_selection = pd.read_csv(question_prs_selection_path,
                                            sep="\t")

    # create output directory
    create_dir(output_dir_ori_path)

    # create log file
    log_file_path = "log_file_{id}_{suffix}_{date}.txt".format(
        id=pgs_id, suffix=suffix, date=datetime.now().strftime("%d-%m-%Y"))
    logfile = open(os.path.join(output_dir_ori_path, log_file_path), "w")

    # read input files
    df_quest = pd.read_pickle(input_df_path)
    df_question_ids_total = pd.read_csv(input_question_overview,
                                        sep="\t",
                                        index_col=0,
                                        dtype="str")
    df_question_ids = df_question_ids_total.loc[:,
                                                df_question_ids_total.columns.
                                                difference([
                                                    "Number of timepoints",
                                                    "Question answers"
                                                ])]
    df_question_ids.columns = df_question_ids.columns.astype(float)
    df_question_ids = df_question_ids.T
    df_question_ids = df_question_ids.sort_index()
    trait_subset = pd.read_pickle(input_pgs_path)

    # Filter questionnaire question data on meta information
    start_columns = df_quest.columns[df_quest.columns.str.startswith("covt")]
    date_columns = df_quest.columns[df_quest.columns.str.endswith("DATE")]
    age_columns = df_quest.columns[df_quest.columns.str.endswith("AGE")]
    gender_columns = df_quest.columns[df_quest.columns.str.endswith("GENDER")]
    date_variant_id = df_quest.columns[df_quest.columns.str.endswith(
        "VARIANT_ID")]
    date_zip_code = df_quest.columns[df_quest.columns.str.endswith("ZIP_CODE")]
    date_response_rate = df_quest.columns[df_quest.columns.str.contains(
        "responsedate")]
    skip_columns = [
        "covt17_COVID172TXT", "covt17_COVID177TXT", "covt17_COVID192A",
        "covt17_COVID192B", "covt17_PSEUDOIDEXT"
    ]
    selection_columns = start_columns.difference(date_columns).difference(
        age_columns).difference(gender_columns).difference(
            date_variant_id).difference(date_zip_code).difference(
                date_response_rate).difference(skip_columns)
    df_quest = df_quest.loc[df_quest.index.intersection(trait_subset.index), :]

    # trait selection
    trait_subset.columns = (trait_subset.columns
                            .str.replace("/", ".", regex=False)
                            .str.replace(" ", ".", regex=False)
                            .str.replace("-", ".", regex=False)
                            .str.replace("(", ".", regex=False)
                            .str.replace(")", ".", regex=False))
    df_selected_traits = pd.read_csv(selected_trait_file_path, header=None)
    trait_subset = trait_subset.loc[:,
                                    trait_subset.columns.
                                    intersection(df_selected_traits.iloc[:,
                                                                         0])]
    trait_subset = trait_subset.iloc[:, [pgs_id]]
    print("Process trade: {}".format(trait_subset.columns[0]))

    #  model covariate columns
    correction_columns = [
        "age_recent", "age2_recent", "chronic_recent", "household_recent",
        "have_childs_at_home_recent", "gender_recent"
    ]
    correction_df = df_quest.loc[:, correction_columns]

    # question selection
    df_selected_questions = pd.read_csv(model_selection_file,
                                        sep="\t",
                                        index_col="Question")
    df_selected_questions = df_selected_questions.dropna(subset=["Type"])
    question_list = df_selected_questions.index.intersection(
        df_question_ids.columns)

    # create output dataframes
    multiIndex_columns = pd.MultiIndex.from_product(
        [question_list, df_question_ids.index], names=["question", "quest_nr"])
    df_betas_per_question = pd.DataFrame(index=trait_subset.columns,
                                         columns=multiIndex_columns)
    df_pvalues_per_question = pd.DataFrame(index=trait_subset.columns,
                                           columns=multiIndex_columns)
    df_se_values_per_question = pd.DataFrame(index=trait_subset.columns,
                                             columns=multiIndex_columns)
    n_values_per_question = []
    value_counts_per_question = []

    # process all questions
    for column in question_list:
        question_ids_of_question = df_question_ids.loc[:, column]
        question_ids_of_question = question_ids_of_question[
            ~question_ids_of_question.isna()]

        # create temp lists for model outputs
        betas_per_week = []
        pvalues_per_week = []
        se_values_per_week = []
        n_values_per_week = {}
        values_counts_per_week = {}
        # process all datapoints of the question
        for index, quest_id in question_ids_of_question.items():
            if quest_id in selection_columns:

                # create model input data
                df_subset = df_quest.loc[:, quest_id]
                df_subset = df_subset.astype(float)
                df_subset = df_subset.dropna()

                # find the correct model
                model_type = df_selected_questions.loc[column, "Type"]
                if model_type == "ordinal":
                    df_subset = df_subset.sort_index()
                    df_subset = df_subset.astype(int).astype("category")
                    model_type = "ordinal"
                elif model_type == "ordinal-ordered" or model_type == "ordinal-ordered-turned":
                    df_subset = df_subset.sort_index()
                    df_subset = df_subset.astype(int).astype(
                        "category").cat.as_ordered()
                    model_type = "ordinal"

                # fit all models per trait
                beta_values_per_model = {}
                pvalues_per_model = {}
                se_values_per_model = {}
                for pgs_column_name in trait_subset.columns:
                    if ((df_question_prs_selection["Question"] == column) &
                        (df_question_prs_selection["prs"]
                         == pgs_column_name)).any():

                        #write logfile info
                        logfile.write(
                            "\n\nProcess: {question}, {question_id}, {pgs}, {model}\n"
                            .format(question=column,
                                    question_id=quest_id,
                                    pgs=pgs_column_name,
                                    model=model_type))
                        print("process", column, pgs_column_name)
                        try:
                            # create model and set the model parameters
                            df_model_input = pd.merge(
                                trait_subset.loc[:, [pgs_column_name]],
                                correction_df,
                                left_index=True,
                                right_index=True)
                            fit_args = {}
                            if model_type == "binomial":
                                df_model_input["intercept"] = 1.0
                                mod = sm.Logit(
                                    df_subset,
                                    df_model_input.loc[df_subset.index, :])
                                fit_args["maxiter"] = 10000
                            elif model_type == "ordinal":
                                mod = OrderedModel(
                                    df_subset,
                                    df_model_input.loc[df_subset.index, :],
                                    distr="logit")
                                fit_args["method"] = 'bfgs'
                                fit_args["maxiter"] = 10000
                            else:
                                df_model_input["intercept"] = 1.0
                                mod = sm.OLS(
                                    df_subset,
                                    df_model_input.loc[df_subset.index, :])

                            # fit the model
                            res = mod.fit(**fit_args)

                            #write model output
                            print(res.summary())
                            logfile.write(res.summary().as_text())
                            logfile.write("\n")

                            #save output from the model
                            beta_values_per_model[
                                pgs_column_name] = res.params[pgs_column_name]
                            pvalues_per_model[pgs_column_name] = res.pvalues[
                                pgs_column_name]
                            se_values_per_model[pgs_column_name] = res.bse[
                                pgs_column_name]
                        except sm.tools.sm_exceptions.PerfectSeparationError:
                            print("Exception PerfectSeparationError",
                                  pgs_column_name, quest_id, column)
                            logfile.write(
                                "Error PerfectSeparationError: {question_id}, {pgs}\n"
                                .format(
                                    question_id=quest_id,
                                    pgs=pgs_column_name,
                                ))
                            continue
                        except np.linalg.LinAlgError:
                            print("Exception LinAlgError", pgs_column_name,
                                  quest_id, column)
                            logfile.write(
                                "Error LinAlgError: {question_id}, {pgs}\n".
                                format(
                                    question_id=quest_id,
                                    pgs=pgs_column_name,
                                ))
                            continue
                        except UnboundLocalError:
                            print("Exception UnboundLocalError",
                                  pgs_column_name, quest_id, column)
                            logfile.write(
                                "Error UnboundLocalError: {question_id}, {pgs}\n"
                                .format(
                                    question_id=quest_id,
                                    pgs=pgs_column_name,
                                ))
                            continue
                        except Exception:
                            print("Exception", pgs_column_name, quest_id,
                                  column)
                            logfile.write(
                                "Error Exception (general): {question_id}, {pgs}\n"
                                .format(
                                    question_id=quest_id,
                                    pgs=pgs_column_name,
                                ))
                            continue

                # save all the model output per time point
                model_betas = pd.Series(beta_values_per_model, name=index)
                betas_per_week.append(model_betas)

                model_pvalues = pd.Series(pvalues_per_model, name=index)
                pvalues_per_week.append(model_pvalues)

                model_se_values = pd.Series(se_values_per_model, name=index)
                se_values_per_week.append(model_se_values)

                n_values_per_week[index] = df_subset.shape[0]

                val_counts = df_subset.value_counts()
                val_counts.index = val_counts.index.astype(int).astype(str)
                values_counts_per_week[index] = json.dumps(
                    val_counts.to_dict())
        # save all the model information per PRS, per time point, per question (multi column table)
        if len(betas_per_week) > 0:
            df_model_betas = pd.concat(betas_per_week, axis=1)
            df_model_betas.columns = pd.MultiIndex.from_product(
                [[column], df_model_betas.columns])
            df_betas_per_question.loc[df_model_betas.index,
                                      df_model_betas.columns] = df_model_betas

            df_model_pvalues = pd.concat(pvalues_per_week, axis=1)
            df_model_pvalues.columns = pd.MultiIndex.from_product(
                [[column], df_model_pvalues.columns])
            df_pvalues_per_question.loc[
                df_model_pvalues.index,
                df_model_pvalues.columns] = df_model_pvalues

            df_model_se_values = pd.concat(se_values_per_week, axis=1)
            df_model_se_values.columns = pd.MultiIndex.from_product(
                [[column], df_model_se_values.columns])
            df_se_values_per_question.loc[
                df_model_se_values.index,
                df_model_se_values.columns] = df_model_se_values

            n_values_per_question.append(
                pd.Series(n_values_per_week, name=column))
            value_counts_per_question.append(
                pd.Series(values_counts_per_week, name=column))

    df_nvalues = pd.concat(n_values_per_question, axis=1)
    df_value_counts = pd.concat(value_counts_per_question, axis=1)
    logfile.close()

    print("correlations calculation ready")

    # save the results per PRS in separated folders
    for PRS_name in df_betas_per_question.index:
        print("PRS_name", PRS_name)
        #get the export data per PRS from the dataframes
        df_prs_subset = df_betas_per_question.loc[PRS_name, :]
        df_prs_subset = df_prs_subset.unstack(level=-1)

        df_prs_subset_pvalues = df_pvalues_per_question.loc[PRS_name, :]
        df_prs_subset_pvalues = df_prs_subset_pvalues.unstack(level=-1)

        df_prs_subset_se_values = df_se_values_per_question.loc[PRS_name, :]
        df_prs_subset_se_values = df_prs_subset_se_values.unstack(level=-1)

        time_series_info_corr = df_prs_subset.copy()
        time_series_info_corr.columns = time_series_info_corr.columns.astype(
            str)

        # create the PRS output dir
        output_prs_dir = os.path.join(output_dir_ori_path, PRS_name)
        create_dir(output_prs_dir)

        # export the files
        df_prs_subset_pvalues.columns = df_prs_subset_pvalues.columns.astype(
            str)
        export_df(df_prs_subset_pvalues, output_prs_dir, PRS_name,
                  "p_values_{}".format(suffix))

        df_prs_subset_se_values.columns = df_prs_subset_se_values.columns.astype(
            str)
        export_df(df_prs_subset_se_values, output_prs_dir, PRS_name,
                  "se_values_{}".format(suffix))

        df_nvalues = df_nvalues.T
        print(df_nvalues)
        df_nvalues.columns = df_nvalues.columns.astype(str)
        export_df(df_nvalues, output_prs_dir, PRS_name,
                  "n_values_{}".format(suffix))

        df_value_counts = df_value_counts.T
        df_value_counts.columns = df_value_counts.columns.astype(str)
        export_df(df_value_counts, output_prs_dir, PRS_name,
                  "value_counts_{}".format(suffix))

        time_series_info_corr["Question answers"] = df_selected_questions.loc[
            time_series_info_corr.index, "Question answers"]
        time_series_info_corr = time_series_info_corr.loc[:, [
            "Question answers", *list(df_prs_subset.columns.astype(str))
        ]]
        export_df(time_series_info_corr, output_prs_dir, PRS_name,
                  "correlations_{}".format(suffix))
Example #3
def ordinal_regression_formula(data, formula, distr="probit"):
    model = OrderedModel.from_formula(formula=formula, data=data, distr=distr)
    result = model.fit(method="bfgs")
    summary = result.summary()
    odds_radio = get_odds_radio(result)
    return result, summary, odds_radio
Example #4
def ordinal_regression(x, y, distr="probit"):
    model = OrderedModel(y, x, distr=distr)
    result = model.fit(method="bfgs")
    summary = result.summary()
    odds_radio = get_odds_radio(result)
    return result, summary, odds_radio
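

# get_odds_radio is not defined in these snippets. Below is a minimal sketch of
# such a helper, assuming it exponentiates the fitted explanatory-variable
# coefficients to obtain odds ratios (hypothetical implementation).
import numpy as np


def get_odds_radio(result):
    # exp(beta) per explanatory variable; the trailing threshold parameters of
    # the fitted OrderedModel are dropped
    k_thresholds = result.model.k_levels - 1
    return np.exp(result.params[:-k_thresholds])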
Example #5
data = pd.read_csv(r"D:/书籍资料整理/属性数据分析/政治意识与党派.csv")
# data['意识形态']=data['意识形态'].replace({'很自由':1,'有点自由':2,'中等':3,'有点保守':4,'很保守':5})
data['政治党派'] = data['政治党派'].replace({'民主党人': 1, '共和党人': 0})  # party: Democrat=1, Republican=0
tmp = pd.DataFrame()
for i in range(0, 20):
    # repeat each row according to its count in the '值' (count) column
    tmp = pd.concat([tmp, pd.DataFrame([data.loc[i]] * int(data.iloc[i]['值']))])
tmp = tmp.reset_index()
del tmp['值']
del tmp['index']
# tmp.to_csv(r'D:/书籍资料整理/属性数据分析/政治意识与党派_整理数据.csv')
# The resulting output shows that the explanatory-variable coefficient has the
# opposite sign. This can be explained: the model is presented as α - βx,
# whereas the book's results use α + βx.
# However, the intercepts from the second one onward differ widely from the
# book and are hard to explain. The OrderedModel feature is not shipped as part
# of the package itself and the documentation barely mentions it.
# It is a feature that is going to be brought into statsmodels and is not yet
# complete; to be revisited later.
tmp['意识形态'] = tmp['意识形态'].astype('category')
s = pd.Series(["a", "b", "c", "a", "d", "e"])
cat_type = CategoricalDtype(categories=['很自由', '有点自由', '中等', '有点保守', '很保守'],
                            ordered=True)  # categories must be a list
tmp['意识形态'] = tmp['意识形态'].astype(cat_type)

modf_logit = OrderedModel.from_formula("意识形态~政治党派", tmp, distr='logit')
resf_logit = modf_logit.fit(method='bfgs')
print(resf_logit.summary())

data = pd.read_csv(r"D:/书籍资料整理/属性数据分析/心灵伤害与SES.csv")

data['心理伤害'] = data['心理伤害'].replace({'健康': 0, '轻度': 1, '中等': 2, '受损': 3})
modf_logit = OrderedModel.from_formula("心理伤害~SES+生活事件", data, distr='logit')
resf_logit = modf_logit.fit()
print(resf_logit.summary())
Example #6
import numpy as np
import pandas
from scipy import stats

from statsmodels.miscmodels.ordinal_model import OrderedModel

nobs, k_vars = 1000, 3
x = np.random.randn(nobs, k_vars)
# x = np.column_stack((np.ones(nobs), x))
# #constant will be in integration limits
xb = x.dot(np.ones(k_vars))
y_latent = xb + np.random.randn(nobs)
y = np.round(np.clip(y_latent, -2.4, 2.4)).astype(int) + 2

print(np.unique(y))
print(np.bincount(y))

mod = OrderedModel(y, x)
# start_params = np.ones(k_vars + 4)
# start_params = np.concatenate((np.ones(k_vars), np.arange(4)))
start_ppf = stats.norm.ppf((np.bincount(y) / len(y)).cumsum())
start_threshold = np.concatenate(
    (start_ppf[:1], np.log(np.diff(start_ppf[:-1]))))
start_params = np.concatenate((np.zeros(k_vars), start_threshold))
res = mod.fit(start_params=start_params, maxiter=5000, maxfun=5000)
print(res.params)
# res = mod.fit(start_params=res.params, method='bfgs')
res = mod.fit(start_params=start_params, method='bfgs')

print(res.params)
print(np.exp(res.params[-(mod.k_levels - 1):]).cumsum())
# print(res.summary())
Example #7
# The target variable `apply` is a pandas Series of categorical type; this is
# preferred over NumPy arrays.

# The model is based on a numerical latent variable $y_{latent}$ that we
# cannot observe but that we can compute thanks to exogenous variables.
# Moreover we can use this $y_{latent}$ to define $y$ that we can observe.
#
# For more details see the documentation of OrderedModel, [the UCLA
# webpage](https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/)
# or this
# [book](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470594001).
#
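
# Not part of the original notebook excerpt: a minimal loading sketch so that
# `data_student` used below is defined, assuming the UCLA "ologit" example
# dataset referenced above (the exact data URL is an assumption here; the
# category labels follow the text further below).
import pandas as pd
from statsmodels.miscmodels.ordinal_model import OrderedModel

data_student = pd.read_stata("https://stats.idre.ucla.edu/stat/data/ologit.dta")
data_student["apply"] = pd.Categorical(data_student["apply"],
                                       categories=["unlikely", "somewhat likely",
                                                   "very likely"],
                                       ordered=True)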

# ### Probit ordinal regression:

mod_prob = OrderedModel(data_student['apply'],
                        data_student[['pared', 'public', 'gpa']],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

# In our model, we have 3 exogenous variables (the $\beta$s if we keep the
# documentation's notation) so we have 3 coefficients that need to be
# estimated.
#
# Those 3 estimations and their standard errors can be retrieved in the
# summary table.
#
# Since there are 3 categories in the target variable (`unlikely`,
# `somewhat likely`, `very likely`), we have two thresholds to estimate.
# As explained in the doc of the method
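
# Follow-up sketch (not in the original excerpt): recover the estimated cutoff
# points from the fitted parameters. This assumes the
# OrderedModel.transform_threshold_params helper, which maps the internal
# (first threshold, log-increments) parameterization back to actual thresholds.
num_of_thresholds = 2  # 3 categories in `apply` -> 2 thresholds
print(mod_prob.transform_threshold_params(res_prob.params[-num_of_thresholds:]))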