def plot_interaction(data_lastDV):
    """
    Plot the interaction of the given data (should be three columns).

    Draws a single figure with two side-by-side panels: an interaction plot
    of the third column against the first two (left) and a box plot of the
    third column grouped by the first two (right). Rows containing missing
    values are dropped before plotting, and the figure is shown with
    ``plt.show()``.

    :param data_lastDV: data frame containing the independent variables in
        the first two columns, dependent in the third
    :return: None
    """
    col_names = data_lastDV.columns.values  # get the columns' names
    factor_groups = data_lastDV[col_names].dropna()

    # One figure, two panels; keep explicit handles so both plots can be
    # told exactly where to draw.
    _, (ax_interaction, ax_box) = plt.subplots(1, 2)

    # NOTE(review): exactly two colors/markers are supplied, so the second
    # column is presumably expected to have two levels -- confirm with callers.
    interaction_plot(factor_groups[col_names[0]],
                     factor_groups[col_names[1]],
                     factor_groups[col_names[2]],
                     colors=['red', 'blue'], markers=['D', '^'], ms=10,
                     ax=ax_interaction)

    # Without an explicit ``ax`` a grouped boxplot (``by=...``) opens its own
    # figure, which is why it used to appear as a separate plot.
    factor_groups.boxplot(return_type='axes', column=col_names[2],
                          by=[col_names[0], col_names[1]], ax=ax_box)
    plt.show()
plt.ylabel('Salary')

# From our first look at the data, the difference between Master's and PhD
# in the management group is different than in the non-management group.
# This is an interaction between the two qualitative variables management,M
# and education,E. We can visualize this by first removing the effect of
# experience, then plotting the means within each of the 6 groups using
# interaction.plot.

# Salary with the fitted linear effect of experience (X) subtracted out.
U = S - X * interX_lm32.params['X']

plt.figure(figsize=(6, 6))
interaction_plot(E, M, U, colors=['red', 'blue'], markers=['^', 'D'],
                 markersize=10, ax=plt.gca())

# ## Minority Employment Data

# Prefer a local copy of the data; fall back to downloading it. Only
# file-access failures trigger the fallback, so Ctrl-C and genuine
# programming errors are no longer silently swallowed by a bare ``except``.
try:
    jobtest_table = pd.read_table('jobtest.table')
except OSError:  # don't have data already
    url = 'http://stats191.stanford.edu/data/jobtest.table'
    jobtest_table = pd.read_table(url)

factor_group = jobtest_table.groupby(['MINORITY'])

fig, ax = plt.subplots(figsize=(6, 6))
idx = group.index plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1], s=144, edgecolors='black') # drop NA because there is no idx 32 in the final model plt.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(), ls=lstyle[j], color=colors[i-1]) plt.xlabel('Experience'); plt.ylabel('Salary'); # From our first look at the data, the difference between Master's and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management,M and education,E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction.plot. U = S - X * interX_lm32.params['X'] plt.figure(figsize=(6,6)) interaction_plot(E, M, U, colors=['red','blue'], markers=['^','D'], markersize=10, ax=plt.gca()) # ## Minority Employment Data try: minority_table = pandas.read_table('minority.table') except: # don't have data already url = 'http://stats191.stanford.edu/data/minority.table' minority_table = pandas.read_table(url) factor_group = minority_table.groupby(['ETHN']) plt.figure(figsize=(6,6)) colors = ['purple', 'green'] markers = ['o', 'v']
i, j = values idx = group.index plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i - 1], s=144, edgecolors="black") # drop NA because there is no idx 32 in the final model plt.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(), ls=lstyle[j], color=colors[i - 1]) plt.xlabel("Experience") # @savefig fitted_drop32.png align=center plt.ylabel("Salary") # From our first look at the data, the difference between Master's and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management,M and education,E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction.plot. U = S - X * interX_lm32.params["X"] plt.figure(figsize=(6, 6)) # @savefig interaction_plot.png align=center interaction_plot(E, M, U, colors=["red", "blue"], markers=["^", "D"], markersize=10, ax=plt.gca()) # Minority Employment Data # ------------------------ try: minority_table = pandas.read_table("minority.table") except: # don't have data already url = "http://stats191.stanford.edu/data/minority.table" minority_table = pandas.read_table(url) factor_group = minority_table.groupby(["ETHN"]) plt.figure(figsize=(6, 6)) colors = ["purple", "green"] markers = ["o", "v"]
Days Duration Weight ID 0 0.0 1 1 1 1 2.0 1 1 2 2 1.0 1 1 3 3 3.0 1 1 4 4 0.0 1 1 5 r = 3 (weight gain) m = 2 (duration of treatment) n_ij = 10 for all (i, j) """ print("Balanced panel" + "\n") fig = interaction_plot(kidney_table['Weight'], kidney_table['Duration'], np.log(kidney_table['Days'] + 1), colors=['red', 'blue'], markers=['D', '^'], ms=10, ax=plt.gca()) plt.show() formula = "np.log(Days+1) ~ C(Duration)" lm = fit_linear_model(formula, data=kidney_table) formula2 = "np.log(Days+1) ~ C(Weight)" lm2 = fit_linear_model(formula2, data=kidney_table) formula3 = "np.log(Days+1) ~ C(Duration) + C(Weight)" lm3 = fit_linear_model(formula3, data=kidney_table) formula4 = "np.log(Days+1) ~ C(Duration) * C(Weight)"
plt.ylabel("Salary")

# From our first look at the data, the difference between Master's and PhD
# in the management group is different than in the non-management group.
# This is an interaction between the two qualitative variables management,M
# and education,E. We can visualize this by first removing the effect of
# experience, then plotting the means within each of the 6 groups using
# interaction.plot.

# Salary with the fitted linear effect of experience (X) subtracted out.
U = S - X * interX_lm32.params["X"]

plt.figure(figsize=(6, 6))
interaction_plot(E, M, U, colors=["red", "blue"], markers=["^", "D"],
                 markersize=10, ax=plt.gca())

# ## Minority Employment Data

# Prefer a local copy of the data; fall back to downloading it. Only
# file-access failures trigger the fallback, so Ctrl-C and genuine
# programming errors are no longer silently swallowed by a bare ``except``.
try:
    jobtest_table = pd.read_table("jobtest.table")
except OSError:  # do not have data already
    url = "http://stats191.stanford.edu/data/jobtest.table"
    jobtest_table = pd.read_table(url)

factor_group = jobtest_table.groupby(["MINORITY"])

fig, ax = plt.subplots(figsize=(6, 6))
#
# Measurement of fetal head circumference **hs**, by four observers in three fetuses.

# In[55]:

# Data source:
# https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_altman/altman_12_6.txt
df = pd.read_csv(
    '../data/altman_12_6.txt',
    names=['hs', 'fetus', 'observer'],
)
df.head()

# In[56]:

from statsmodels.graphics.api import interaction_plot

# Draw the interaction plot on the axes of a freshly created 8x6 figure.
plt.figure(figsize=(8, 6))
fig = interaction_plot(
    x=df['fetus'],
    trace=df['observer'],
    response=df['hs'],
    ms=10,
    ax=plt.gca(),
)

# In[169]:

# Two-way model with both main effects and their interaction term.
formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
lm = ols(formula, data=df).fit()
print(anova_lm(lm))

#
# ### Chi-squared test
#
# https://en.wikipedia.org/wiki/Chi-squared_test
#
#
if len(groups) == 2: X = data[X] Y = data[S] s = 100 plt.figure(figsize=(8, 6)) groups = data.groupby(data[E]) for key, group in groups: # ERROR (working on it) interaction_plot(X, group, np.log(Y + 1), colors=['r', 'b'], markers=['D', '^'], ms=10, ax=plt.gca()) plt.show() #? else: fig, ax = plt.subplots(figsize=(8, 6)) s = 100 for key, group in groups: # ERROR (working on it) group.plot(ax=ax,
gese.loc[gese.stressful_life_events > 10, 'events_cat'] = 10

# Recode labels

#############################################################################################
# Simple plotting
import matplotlib.pyplot as plt
import seaborn

# Histograms -- each on its own figure; drawn back-to-back on the same axes
# they would overlay and the per-variable shapes noted below would be
# unreadable.
plt.figure()
plt.hist(gese.depression)  # slightly right skewed
plt.figure()
plt.hist(gese.gene)  # unequally distributed
plt.figure()
plt.hist(gese.stressful_life_events)  # heavily right skewed

#############################################################################################
# Testing for interaction
from statsmodels.graphics.api import interaction_plot

fig = interaction_plot(gese.stressful_life_events, gese.gene, gese.depression)
plt.show()
# The plot does indicate an interaction between gene and stressful life events

# Correlation
pearsoncorr = stats.pearsonr(gese.stressful_life_events, gese.depression)
# p-value (<.05) indicates a significant correlation between stressful life
# events and depression outcome

# t test
t_test = stats.ttest_ind(gese.depression[gese.gene == 1],
                         gese.depression[gese.gene == 0])
# Print the result that was just computed (the name printed previously,
# ``t_test_gene``, is not the one assigned above).
print(t_test)
# p-value (>.05) indicates mean of depression is not significantly different
# in these two genotypes
anova_lm_check(res_lm_subset, res_lm_interaction_M_subset)
# Output recorded from the call above:
#
#    df_resid           ssr  df_diff       ss_diff            F        Pr(>F)
# 0      40.0  4.320910e+07      0.0           NaN          NaN           NaN
# 1      38.0  1.711881e+05      2.0  4.303791e+07  4776.734853  2.291239e-46

resid_studentized_subset = plot_residuals_studentized(
    result=res_lm_interaction_M_subset,
    data=salary_table,
)

# fitted value plotting
plot_fitted_values(
    formula=formula_interaction_M,
    data=salary_table,
    drop_idx=drop_idx,
)

# The difference between Master's and PhD in the management group is
# different than in the non-management group (an interaction between the two
# qualitative variables M and E).
#   => first remove the effect of experience,
#   => then plot the means within each of the 6 groups using interaction.plot.
experience_coef = res_lm_interaction_X_subset.params['X']
U = salary_table.S - salary_table.X * experience_coef

# Interaction plot for factor level statistics.
interaction_plot(
    salary_table.E,
    salary_table.M,
    U,
    colors=['red', 'blue'],
    markers=['^', 'D'],
    markersize=10,
    ax=plt.gca(),
)
plt.show()