"""One-way ANOVAs, boxplots and Tukey HSD post-hoc tests on the log-median
running times in Datos.csv, one analysis per experimental factor."""
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import researchpy as rp
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Load only the four factor columns and the response column (mediana).
df = pd.read_csv(
    "Datos.csv",
    index_col=None,
    usecols=[1, 2, 3, 4, 8],
    dtype={
        'generador': 'category',
        'algoritmo_flujo': 'category',
        'vertices': 'category',
        'aristas': 'category',
        'mediana': np.float64,
    },
)

# log1p-transform the response to reduce skewness, then drop the raw column.
df = df.assign(mediana_log=np.log1p(df['mediana']).values)
df.drop(['mediana'], axis=1, inplace=True)

factores = ["vertices", "generador", "aristas", "algoritmo_flujo"]
plt.figure(figsize=(8, 6))
for factor in factores:
    # Descriptive statistics of the log-median per level of this factor.
    print(rp.summary_cont(df['mediana_log'].groupby(df[factor])))

    # One-way ANOVA with this factor as the between-groups variable.
    anova = pg.anova(dv='mediana_log', between=factor, data=df, detailed=True)
    # NOTE(review): pg._export_table is private pingouin API; kept so the
    # output files stay identical, but anova.to_csv(...) is the public route.
    pg._export_table(anova, "ANOVA" + factor + ".csv")

    ax = sns.boxplot(x=df["mediana_log"], y=df[factor], data=df,
                     palette="Set1")
    plt.savefig("boxplot_" + factor + ".png", bbox_inches='tight')
    plt.savefig("boxplot_" + factor + ".eps", bbox_inches='tight')

    # Tukey HSD pairwise comparisons between the levels of the factor.
    tukey = pairwise_tukeyhsd(endog=df["mediana_log"], groups=df[factor],
                              alpha=0.05)
    tukey.plot_simultaneous(xlabel='Time', ylabel=factor)
    plt.vlines(x=49.57, ymin=-0.5, ymax=4.5, color="red")
    plt.savefig("simultaneous_tukey" + factor + ".png", bbox_inches='tight')
    plt.savefig("simultaneous_tukey" + factor + ".eps", bbox_inches='tight')
    print(tukey.summary())

    # Persist the Tukey summary; 'with' guarantees the handle is closed.
    with open("Tukey" + factor + ".csv", 'w') as t_csv:
        csv.writer(t_csv).writerows(tukey.summary())

plt.show()
# Second analysis: the maximum-flow value (FlujoMax) against each graph
# metric. Assumes `df` (with 'Mediana' and 'FlujoMax' columns) and the
# pd/rp/pg/sns/plt/csv/pairwise_tukeyhsd names are already in scope.
df.drop(['Mediana'], axis=1, inplace=True)

factores = [
    "Grado", "CoefAg", "CentCer", "CentCag", "Excentricidad", "PageRag",
]
plt.figure(figsize=(8, 6))
for factor in factores:
    # Descriptive statistics of FlujoMax per level of the metric.
    print(rp.summary_cont(df['FlujoMax'].groupby(df[factor])))

    # One-way ANOVA with this metric as the between-groups variable.
    anova = pg.anova(dv='FlujoMax', between=factor, data=df, detailed=True)
    # NOTE(review): private pingouin helper; kept for identical output files.
    pg._export_table(anova, "ANOVAsFlujoMax" + factor + ".csv")

    ax = sns.boxplot(x=df["FlujoMax"], y=df[factor], data=df,
                     palette="cubehelix")
    plt.savefig("boxplot_FlujoMax" + factor + ".eps", bbox_inches='tight')

    # Tukey HSD pairwise comparisons between the levels of the metric.
    tukey = pairwise_tukeyhsd(endog=df["FlujoMax"], groups=df[factor],
                              alpha=0.05)
    tukey.plot_simultaneous(xlabel='Flujo Maximo', ylabel=factor)
    plt.savefig("simultaneous_tukey" + factor + ".eps", bbox_inches='tight')
    print(tukey.summary())

    # Context manager closes the CSV handle even if writing fails.
    with open("TukeyFlujoMax" + factor + ".csv", 'w') as t_csv:
        csv.writer(t_csv).writerows(tukey.summary())
def friedman(dv=None, within=None, subject=None, data=None,
             export_filename=None):
    """Friedman test for repeated measurements.

    Non-parametric one-way repeated measures ANOVA by ranks.

    Parameters
    ----------
    dv : string
        Name of column containing the dependant variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    data : pandas DataFrame
        DataFrame in long format.
    export_filename : string
        Filename (without extension) for the output file. If None, do not
        export the table. By default, the file will be created in the
        current python console directory. To change that, specify the
        filename with full path.

    Returns
    -------
    stats : DataFrame
        Test summary ::

        'Q' : The Friedman Q statistic, corrected for ties
        'p-unc' : Uncorrected p-value
        'dof' : degrees of freedom

    Notes
    -----
    Data are expected to be in long-format. If the dataset contains one or
    more other within-subject factors, the dependant variable is
    automatically collapsed to the mean (same behavior as the ezANOVA R
    package), so results can differ from those of JASP. If you can, always
    double-check the results.

    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than 6
    repeated measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject', data=df)
                      Source  ddof1      Q     p-unc
    Friedman  Disgustingness      1  9.228  0.002384
    """
    # Validate the dataframe and the requested column names.
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Collapse any additional within-subject factors to the mean.
    data = data.groupby([subject, within]).mean().reset_index()

    # Drop missing values, if any.
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Build the (n subjects x k conditions) score matrix.
    grouped = data.groupby(within)[dv]
    conditions = list(data[within].unique())
    k = len(conditions)
    scores = np.array([grouped.get_group(c).values for c in conditions]).T
    n = scores.shape[0]

    # Rank the k conditions within each subject, then sum ranks per condition.
    ranks = np.apply_along_axis(scipy.stats.rankdata, 1, scores)
    ssbn = (ranks.sum(axis=0) ** 2).sum()

    # Friedman Q statistic.
    Q = (12 / (n * k * (k + 1))) * ssbn - 3 * n * (k + 1)

    # Correct Q for ties within each subject's scores (ties in raw scores
    # correspond one-to-one to ties in ranks).
    ties = 0
    for row in scores:
        _, repnum = scipy.stats.find_repeats(row)
        for t in repnum:
            ties += t * (t * t - 1)
    correction = 1 - ties / float(k * (k * k - 1) * n)
    Q /= correction

    # Chi-squared approximation with k - 1 degrees of freedom.
    ddof1 = k - 1
    p_unc = scipy.stats.chi2.sf(Q, ddof1)

    # Assemble the summary table.
    stats = pd.DataFrame({'Source': within,
                          'ddof1': ddof1,
                          'Q': np.round(Q, 3),
                          'p-unc': p_unc,
                          }, index=['Friedman'])
    stats = stats.reindex(columns=['Source', 'ddof1', 'Q', 'p-unc'])
    stats.dropna(how='all', axis=1, inplace=True)

    # Export to .csv if requested.
    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
def cochran(dv=None, within=None, subject=None, data=None,
            export_filename=None):
    """Cochran Q test. Special case of the Friedman test when the dependant
    variable is binary.

    Parameters
    ----------
    dv : string
        Name of column containing the binary dependant variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    data : pandas DataFrame
        DataFrame in long format.
    export_filename : string
        Filename (without extension) for the output file. If None, do not
        export the table. By default, the file will be created in the
        current python console directory. To change that, specify the
        filename with full path.

    Returns
    -------
    stats : DataFrame
        Test summary ::

        'Q' : The Cochran Q statistic
        'p-unc' : Uncorrected p-value
        'dof' : degrees of freedom

    Notes
    -----
    The Cochran Q Test is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary. Data are expected to
    be in long-format. NaN are automatically removed from the data.

    The Q statistics is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n`
    where :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256-266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(dv='Energetic', within='Time', subject='Subject', data=df)
            Source  dof      Q     p-unc
    cochran   Time    2  6.706  0.034981
    """
    # Validate the dataframe and the requested column names.
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Drop missing values, if any.
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Per-condition (x_j) and per-subject (x_i) success totals.
    cond_totals = data.groupby(within)[dv].sum()
    subj_totals = data.groupby(subject)[dv].sum()
    k = data[within].nunique()
    dof = k - 1
    grand_total = cond_totals.sum()

    # Cochran Q statistic and its chi-squared p-value.
    q = (dof * (k * np.sum(cond_totals ** 2) - grand_total ** 2)) / \
        (k * grand_total - np.sum(subj_totals ** 2))
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Assemble the summary table.
    stats = pd.DataFrame({'Source': within,
                          'dof': dof,
                          'Q': np.round(q, 3),
                          'p-unc': p_unc,
                          }, index=['cochran'])

    # Export to .csv if requested.
    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
def kruskal(dv=None, between=None, data=None, detailed=False,
            export_filename=None):
    """Kruskal-Wallis H-test for independent samples.

    Parameters
    ----------
    dv : string
        Name of column containing the dependant variable.
    between : string
        Name of column containing the between factor.
    data : pandas DataFrame
        DataFrame in long format.
    export_filename : string
        Filename (without extension) for the output file. If None, do not
        export the table. By default, the file will be created in the
        current python console directory. To change that, specify the
        filename with full path.

    Returns
    -------
    stats : DataFrame
        Test summary ::

        'H' : The Kruskal-Wallis H statistic, corrected for ties
        'p-unc' : Uncorrected p-value
        'dof' : degrees of freedom

    Notes
    -----
    The Kruskal-Wallis H-test tests the null hypothesis that the population
    median of all of the groups are equal. It is a non-parametric version of
    ANOVA. The test works on 2 or more independent samples, which may have
    different sizes.

    Due to the assumption that H has a chi square distribution, the number
    of samples in each group must not be too small. A typical rule is that
    each sample must have at least 5 measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Kruskal-Wallis H-test for independent samples.

    >>> from pingouin import kruskal, read_dataset
    >>> df = read_dataset('anova')
    >>> kruskal(dv='Pain threshold', between='Hair color', data=df)
                 Source  ddof1       H     p-unc
    Kruskal  Hair color      3  10.589  0.014172
    """
    # Validate the dataframe and the requested column names.
    _check_dataframe(dv=dv, between=between, data=data, effects='between')

    # Remove NaN values and reset the index (avoids duplicate-axis errors).
    data = data.dropna().reset_index(drop=True)

    # Number of groups and total sample size.
    n_groups = data[between].nunique()
    n = data[dv].size

    # Rank the whole sample, dealing with ties appropriately.
    data['rank'] = scipy.stats.rankdata(data[dv])

    # Per-group rank sums and sizes.
    by_group = data.groupby(between)['rank']
    rank_sums = by_group.sum().values
    group_sizes = by_group.count().values

    # H statistic, then tie correction.
    H = (12 / (n * (n + 1)) * np.sum(rank_sums ** 2 / group_sizes)) \
        - 3 * (n + 1)
    H /= scipy.stats.tiecorrect(data['rank'].values)

    # Chi-squared approximation with n_groups - 1 degrees of freedom.
    ddof1 = n_groups - 1
    p_unc = scipy.stats.chi2.sf(H, ddof1)

    # Assemble the summary table.
    stats = pd.DataFrame({'Source': between,
                          'ddof1': ddof1,
                          'H': np.round(H, 3),
                          'p-unc': p_unc,
                          }, index=['Kruskal'])
    stats = stats.reindex(columns=['Source', 'ddof1', 'H', 'p-unc'])
    stats.dropna(how='all', axis=1, inplace=True)

    # Export to .csv if requested.
    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats