Example No. 1
    def plot_main_effects(self, intensity):

        if intensity[-1] != "%":
            intensity += "%"

        plt.subplots(1, 3, figsize=(12, 7))
        plt.suptitle("{} Activity".format(intensity.capitalize()))

        plt.subplot(1, 3, 1)
        model_summary = rp.summary_cont(
            self.df_percent.groupby(['Model']))[intensity]
        model_means = model_summary["Mean"]
        model_sd = model_summary["SD"]
        plt.bar(model_means.index,
                [100 * i for i in model_means.values],
                yerr=[100 * i for i in model_sd],
                capsize=10,
                ecolor='black',
                color=["Red", "Blue", "Green", "Purple"],
                edgecolor='black',
                linewidth=2)
        plt.ylabel("% of Collection")
        plt.title("Model Means")

        plt.subplot(1, 3, 2)
        group_summary = rp.summary_cont(
            self.df_percent.groupby(['Group']))[intensity]
        group_means = group_summary["Mean"]
        group_sd = group_summary["SD"]
        plt.bar(group_means.index,
                [100 * i for i in group_means.values],
                yerr=[100 * i for i in group_sd],
                capsize=10,
                ecolor='black',
                color=["Grey", "White"],
                edgecolor='black',
                linewidth=2)
        plt.title("Group Means")

        plt.subplot(1, 3, 3)
        sns.pointplot(data=self.df_percent,
                      x="Model",
                      y=intensity,
                      hue="Group",
                      dodge=True,
                      markers='o',
                      capsize=.1,
                      errwidth=1,
                      palette='Set1')
        plt.title("All Combination Means")
        plt.ylabel(" ")
Example No. 2
    def plot_activity_group_means(self):

        # Half-width of the 95% confidence interval for each activity group
        ci_range = sms.DescrStatsW(
            self.df_kappa_long.groupby("Group").get_group("LOW")
            ["Kappa"]).tconfint_mean()
        ci_width_low = (ci_range[1] - ci_range[0]) / 2

        ci_range = sms.DescrStatsW(
            self.df_kappa_long.groupby("Group").get_group("HIGH")
            ["Kappa"]).tconfint_mean()
        ci_width_high = (ci_range[1] - ci_range[0]) / 2

        e_bars = [ci_width_low, ci_width_high]  # same order as the bars below

        group_means = rp.summary_cont(self.df_kappa_long.groupby(
            ['Group']))["Kappa"]["Mean"]

        plt.bar(["LOW", "HIGH"], [group_means["LOW"], group_means["HIGH"]],
                yerr=e_bars,
                capsize=8,
                ecolor='black',
                color=["white", "dimgrey"],
                edgecolor='black',
                alpha=0.5,
                linewidth=2)
        plt.title("Cohen's Kappa by Activity Group")

        plt.ylabel("Cohen's Kappa")
        plt.yticks(np.arange(0, 1.1, 0.1), fontsize=10)
Example No. 3
def two_way_anova(xs: tuple,
                  ys: tuple,
                  values: tuple,
                  replications,
                  stds: tuple = None,
                  log_transform=True):
    with pd.option_context('display.max_rows', 100):
        xname, xlevels = xs
        yname, ylevels = ys
        dname, data = values

        # Each x level gets the full block of y levels, each repeated
        # `replications` times
        y = np.tile(np.repeat(ylevels, replications), len(xlevels))

        x = np.repeat(xlevels, len(ylevels) * replications)

        df = pd.DataFrame({dname: data, xname: x, yname: y})

        if stds:
            df[stds[0]] = stds[1]
            # Rearrange
            df = df[[dname, stds[0], xname, yname]]

        df[dname] = df[dname].astype(float)
        print("=" * 30)
        print("Original data")
        print(df)

        # Remove stds again
        if stds:
            del df[stds[0]]

        if log_transform:
            print("=" * 30)
            print("LN Transformed data")
            df[dname] = np.log(df[dname])
            print(df)

        print(rp.summary_cont(df.groupby([xname, yname]))[dname])

        model = ols(f"{dname}~ C({xname})*C({yname})", df).fit()
        # Seeing if the overall model is significant
        print("=" * 30)
        print(
            f"Overall model F({model.df_model:.0f},{model.df_resid:.0f}) = {model.fvalue:.3f}, "
            f"p = {model.f_pvalue:.4f}")
        print(model.summary())

        print("=" * 30)
        print("ANOVA")
        res = sm.stats.anova_lm(model, typ=2)
        print(res)
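A minimal usage sketch for the helper above; the factor names, levels, and the twelve response values are made-up illustrative numbers, not data from any of the other examples:

# Hypothetical 2x2 design with 3 replications per cell. The values are ordered
# with the first factor varying slowest (all of Fert A's six runs, then Fert
# B's), matching the np.repeat/np.tile layout built inside two_way_anova.
two_way_anova(xs=("Fert", ["A", "B"]),
              ys=("Water", ["low", "high"]),
              values=("Yield", [4.1, 4.3, 3.9, 5.0, 5.2, 5.1,
                                6.2, 6.0, 6.1, 7.3, 7.1, 7.4]),
              replications=3,
              log_transform=False)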
Example No. 4
    def plot_mains_effects_kappa(self):

        e_bars = [self.df_kappa_ci[1], self.df_kappa_ci[0]]

        group_means = rp.summary_cont(self.df_kappa_long.groupby(
            ['Group']))["Kappa"]["Mean"]

        plt.bar(["LOW", "HIGH"], [group_means["LOW"], group_means["HIGH"]],
                yerr=e_bars,
                capsize=8,
                ecolor='black',
                color=["white", "dimgrey"],
                edgecolor='black',
                alpha=0.5,
                linewidth=2)
        plt.title("Cohen's Kappa by Activity Group")

        plt.ylabel("Kappa")
        plt.yticks(np.arange(0, 1.1, 0.1), fontsize=10)
Example No. 5
print(my_df.info())
print(my_df.describe())
print(my_df.shape)

# It looks like we have 5497 observations (examples) and 6 features (dimensions), but one of those
# features is the unique identifier.

# Notice that we are missing data in the currentsalary column.
# We have to decide what to do with the missing data: we could delete those
# examples/observations, or we could impute (estimate) the missing values.
# In practice this decision should not be made in isolation by a single individual.
# Let's use the average for the low, medium and high groups to impute the missing values.
print(my_df.isnull().sum())

pd.set_option('display.max_columns', 10)
print(rp.summary_cont(my_df['currentsalary'].groupby(my_df['flightrisk'])))

my_df1 = my_df.query('flightrisk=="high"')
my_avg1 = my_df1['currentsalary'].mean()
# Assign rather than call fillna(..., inplace=True) on the selected column;
# the inplace form can raise SettingWithCopyWarning on these query() copies
my_df1['currentsalary'] = my_df1['currentsalary'].fillna(my_avg1)
print(my_df1)

my_df2 = my_df.query('flightrisk=="medium"')
my_avg2 = my_df2['currentsalary'].mean()
my_df2['currentsalary'] = my_df2['currentsalary'].fillna(my_avg2)
print(my_df2)

my_df3 = my_df.query('flightrisk=="low"')
my_avg3 = my_df3['currentsalary'].mean()
my_df3['currentsalary'] = my_df3['currentsalary'].fillna(my_avg3)
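The three per-group subsets above can also be collapsed into a single grouped transform; a minimal sketch, assuming the same my_df with its 'flightrisk' and 'currentsalary' columns:

# Impute each missing salary with the mean of its own flightrisk group
my_df['currentsalary'] = my_df.groupby('flightrisk')['currentsalary'] \
                              .transform(lambda s: s.fillna(s.mean()))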
Example No. 6
import pandas as pd
import matplotlib.pyplot as plt
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

data_frame = pd.read_csv('/home/normie/Documents/advent/2020-1-23-HMB312-Lab3-AM-PlaqueCount.csv')
data_frame = data_frame.drop('Group # ', axis=1)  # note the trailing space in the column name
data_frame = data_frame.drop('TA', axis=1)
data_frame = data_frame.rename(columns={'Hippocampus Amyloid Counts': 'hippocampus',
                                        'Cerebellum Amyloid Counts': 'cerebellum',
                                        'Cortex Amyloid Counts': 'cortex'})

hipposummary = rp.summary_cont(data_frame['hippocampus'].groupby(data_frame['Slide']))
print(hipposummary)
cortexsummary = rp.summary_cont(data_frame['cortex'].groupby(data_frame['Slide']))
print(cortexsummary)
cerebsummary = rp.summary_cont(data_frame['cerebellum'].groupby(data_frame['Slide']))
print(cerebsummary)

hipresults = ols('hippocampus ~ C(Slide)', data=data_frame).fit()
hip_table = sm.stats.anova_lm(hipresults, typ=2)

cerresults = ols('cerebellum ~ C(Slide)', data=data_frame).fit()
cer_table = sm.stats.anova_lm(cerresults, typ=2)

cortresults = ols('cortex ~ C(Slide)', data=data_frame).fit()
cort_table = sm.stats.anova_lm(cortresults, typ=2)

print('HIPPOCAMPUS')
"""
import pandas as pd
import researchpy as rp
import seaborn as sns
import scipy.stats as stats

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

# maximizer
df_m = pd.read_csv("https://raw.githubusercontent.com/impudding/Maximizer-Saticeficer/master/data/ad-questionaire-maximizer.csv")

print('\nMAXIMIZER\n')
# ad recognition
ad_recog_data = rp.summary_cont(df_m.groupby(['size', 'RL', 'hashtag']))['ad_recog']
print('ad recognition\n')
display(ad_recog_data)
#ad_recog_data.to_csv('m_out.csv', sep='\t', encoding='utf-8')

model = ols('ad_recog ~ C(size)*C(RL)*C(hashtag)', df_m).fit()
# Seeing if the overall model is significant
print(f"\nOverall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}")


# brand attitude
brand_data = rp.summary_cont(df_m.groupby(['size', 'RL', 'hashtag']))['brand']
print('brand attitude\n')
display(brand_data)
#brand_data.to_csv('m_out.csv', sep='\t', encoding='utf-8')
Example No. 8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import researchpy as rp
import warnings

warnings.filterwarnings('ignore')  # Turn off all warning messages

my_df = pd.read_csv('ProjectReturnData.txt', delimiter=';')

print(my_df.head(5))
print(my_df.info())
print(my_df.describe())
print(my_df.shape)

# It looks like we have 5181 observations (examples) and 4 features (dimensions) in this dataset.
# It does not appear that we are missing any data
print(my_df.isnull().sum())

pd.set_option('display.max_columns', 10)
print(rp.summary_cont(my_df['Percent Return'].groupby(my_df['Leader Gender'])))
print(rp.summary_cont(my_df['Percent Return'].groupby(my_df['Team Size'])))
print(rp.summary_cont(my_df['Percent Return'].groupby(
    my_df['Aggregate Sales Experience'])))

# Let's build a series of scatter plots to visualize our data
sns.pairplot(my_df,
             hue='Leader Gender',
             diag_kind='hist',
             kind='scatter',
             palette='husl')

# Let's map the genders to numbers: females as 1, males as 0
my_df['Leader Gender'] = my_df['Leader Gender'].map({'Male': 0, 'Female': 1})
print(my_df['Leader Gender'].head(5))
Example No. 9
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import researchpy as rp
import pingouin as pg
import csv
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

df = pd.read_csv("Datos.csv", index_col=None, usecols=[1, 2, 3, 4, 8],
                 dtype={'generador': 'category',
                        'algoritmo_flujo': 'category',
                        'vertices': 'category',
                        'aristas': 'category',
                        'mediana': np.float64})
logX = np.log1p(df['mediana'])
df = df.assign(mediana_log=logX.values)
df.drop(['mediana'], axis=1, inplace=True)

factores = ["vertices", "generador", "aristas", "algoritmo_flujo"]
for i in factores:
    print(rp.summary_cont(df['mediana_log'].groupby(df[i])))
    anova = pg.anova(dv='mediana_log', between=i, data=df, detailed=True)
    pg._export_table(anova, "ANOVA" + i + ".csv")  # _export_table is a private pingouin helper

    plt.figure(figsize=(8, 6))  # new figure per factor so the plots don't overlay
    ax = sns.boxplot(x=df["mediana_log"], y=df[i], data=df, palette="Set1")
    plt.savefig("boxplot_" + i + ".png", bbox_inches='tight')
    plt.savefig("boxplot_" + i + ".eps", bbox_inches='tight')

    tukey = pairwise_tukeyhsd(endog=df["mediana_log"], groups=df[i], alpha=0.05)
    tukey.plot_simultaneous(xlabel='Time', ylabel=i)
    plt.vlines(x=49.57, ymin=-0.5, ymax=4.5, color="red")  # hard-coded reference line
    plt.savefig("simultaneous_tukey" + i + ".png", bbox_inches='tight')
    plt.savefig("simultaneous_tukey" + i + ".eps", bbox_inches='tight')
    print(tukey.summary())

    with open("Tukey" + i + ".csv", 'w', newline='') as t_csv:
        writer = csv.writer(t_csv)
        writer.writerows(tukey.summary().data)
Example No. 10
def contentAnalysis(data, outcomeVar, independent):
    return rp.summary_cont(data[outcomeVar].groupby(data[independent]))
Example No. 11
# These are for exploring data
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

# These are for running the model and conducting model diagnostics
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from scipy import stats
from statsmodels.compat import lzip

df = pd.read_csv('insurance.csv')

print("============================================")
# Let's get more information on the continuous variables
print(rp.summary_cont(df[['charges', 'age', 'children']]))

print("\n===========================================")
# Let's get more information on the categorical data
print(rp.summary_cat(df[['sex', 'smoker', 'region']]))

df['sex'].replace({'female': 1, 'male': 0}, inplace=True)
df['smoker'].replace({'no': 0, 'yes': 1}, inplace=True)

df = pd.get_dummies(df)

print("\n===========================================")
print(df.head())

print("\n===========================================")
model = smf.ols(
Example No. 12
def summerize(col, df):
    summ = rp.summary_cont(df.groupby(['g'])[col])
    display(summ)
Example No. 13
data_new.boxplot(column='Soil_pH', by='District')

import seaborn as sns

ax = sns.boxplot(x="District", y="Soil_pH", data=data_new)

!pip install researchpy

import researchpy as rp
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

sns.violinplot(x="District", y="Soil_pH", data=data_new)

rp.summary_cont(data_new['Soil_pH'].groupby(data_new['District']))

stats.f_oneway(data_new['Soil_pH'][data_new['District'] == 'Kannur'],
               data_new['Soil_pH'][data_new['District'] == 'Kollam'],
               data_new['Soil_pH'][data_new['District'] == 'Kottayam'],
               data_new['Soil_pH'][data_new['District'] == 'Thrissur'])

from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(endog=data_new['Soil_pH'], groups=data_new['District'], alpha=0.05)

tukey.plot_simultaneous()
# The x for the reference line must be numeric, not a column name; the
# overall mean pH is a reasonable choice here
plt.vlines(x=data_new['Soil_pH'].mean(), ymin=-0.5, ymax=3.5, color='red')
tukey.summary()

ax = sns.boxplot(x="District", y="Soil_Mg", data=data_new)

stats.f_oneway(data_new['Soil_Mg'][data_new['District'] == 'Kannur'],
               data_new['Soil_Mg'][data_new['District'] == 'Kollam'],
               data_new['Soil_Mg'][data_new['District'] == 'Kottayam'],
               data_new['Soil_Mg'][data_new['District'] == 'Thrissur'])
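stats.f_oneway assumes roughly equal group variances; a quick check with Levene's test, as a minimal sketch reusing the same data_new frame:

groups = [g['Soil_pH'].dropna() for _, g in data_new.groupby('District')]
print(stats.levene(*groups))  # p < .05 would argue against the equal-variance assumption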
Example No. 14
    def plot_main_effects(self, intensity):

        if intensity[-1] != "%":
            intensity += "%"

        plt.subplots(1, 3, figsize=(12, 7))
        plt.suptitle("{} Activity (±95%CI)".format(intensity.capitalize()))

        # MODEL MEANS -------------------------------------------------------------------------------------------------
        plt.subplot(1, 3, 1)

        # Two-tailed 95% CI: t critical value at the .975 quantile with n - 1 df
        t_crit = scipy.stats.t.ppf(.975,
                                   int(len(set(self.df_percent["ID"])) - 1))

        model_summary = rp.summary_cont(
            self.df_percent.groupby(['Model']))[intensity]
        model_means = model_summary["Mean"]
        model_ci = model_summary["SE"] * t_crit

        plt.bar(model_means.index,
                [100 * i for i in model_means.values],
                yerr=[100 * i for i in model_ci],
                capsize=10,
                ecolor='black',
                color=["White", "silver", "grey", "#404042"],
                edgecolor='black',
                linewidth=2)
        # color=["Red", "Blue", "Green", "Purple"]
        plt.ylabel("% of Collection")
        plt.title("Model Means")

        # ACTIVITY GROUPS ---------------------------------------------------------------------------------------------
        plt.subplot(1, 3, 2)

        group_summary = rp.summary_cont(
            self.df_percent.groupby(['Group']))[intensity]
        group_means = group_summary["Mean"]
        # Show ±95% CI here as well, as the suptitle promises, rather than SD
        group_ci = group_summary["SE"] * t_crit
        plt.bar(group_means.index,
                [100 * i for i in group_means.values],
                yerr=[100 * i for i in group_ci],
                capsize=10,
                ecolor='black',
                color=["Grey", "White"],
                edgecolor='black',
                linewidth=2)
        plt.title("Group Means")

        plt.subplot(1, 3, 3)
        sns.pointplot(data=self.df_percent,
                      x="Model",
                      y=intensity,
                      hue="Group",
                      markers=".",
                      scale=.8,
                      dodge=True,
                      capsize=.1,
                      errwidth=1,
                      palette='Set1')
        plt.title("All Combination Means")
        plt.ylabel(" ")
Example No. 15
main_df = main_df.drop('person', axis=1)

# Map 'dose' column values with string analogues
main_df['dose'] = main_df['dose'].map({1: 'placebo', 2: 'low', 3: 'high'})

display(main_df['dose'])

# In[6]:

display(rp.summary_cat(main_df['dose']))

display(rp.summary_cat(main_df['libido']))

# In[7]:

rp.summary_cont(main_df['libido'].groupby(main_df['dose']))

# In[8]:

# ANOVA example with scipy.stats

display(
    stats.f_oneway(
        main_df['libido'][main_df['dose'] == 'high'],  # sample1
        main_df['libido'][main_df['dose'] == 'low'],  # sample2
        main_df['libido'][main_df['dose'] == 'placebo']  # sample3
    ))

# In[9]:

# ANOVA with statsmodels
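The snippet is cut off after this comment; the statsmodels version of the same one-way ANOVA would presumably look like this, with ols and sm as imported in the other examples:

model = ols('libido ~ C(dose)', data=main_df).fit()
display(sm.stats.anova_lm(model, typ=2))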
Example No. 16
# (i.e., make predictions from the model).

# Let's load the data into a Pandas DataFrame using the read_csv
my_df = pd.read_csv('EmpData.txt', delimiter='|', index_col=0)

print(my_df.head(5))
print(my_df.info())
print(my_df.describe())
print(my_df.shape)

# It looks like we have 5497 observations (examples) and 6 features (dimensions) in this dataset.
# Also notice that we are missing data in the currentsalary column
print(my_df.isnull().sum())

pd.set_option('display.max_columns', 10)
print(rp.summary_cont(my_df['currentsalary'].groupby(my_df['flightrisk'])))

# For this analysis, let's fill the missing values with the average values based on each flightrisk group.
# Assign rather than call fillna(..., inplace=True) on the selected column,
# which can raise SettingWithCopyWarning on these query() copies.
my_df1 = my_df.query('flightrisk=="high"')
my_avg1 = my_df1['currentsalary'].mean()
my_df1['currentsalary'] = my_df1['currentsalary'].fillna(my_avg1)
print(my_df1)

my_df2 = my_df.query('flightrisk=="medium"')
my_avg2 = my_df2['currentsalary'].mean()
my_df2['currentsalary'] = my_df2['currentsalary'].fillna(my_avg2)
print(my_df2)

my_df3 = my_df.query('flightrisk=="low"')
my_avg3 = my_df3['currentsalary'].mean()
my_df3['currentsalary'] = my_df3['currentsalary'].fillna(my_avg3)
Example No. 17
def construct(df, names, score):
    # The header and loop setup here are reconstructed; the source cut this
    # function off mid-loop. For each row, it records the difference between
    # the mean of the `names` columns and the `score` column.
    nnames = len(names)
    newset = []
    for index in df.index:
        total = 0.0
        for name in names:
            total = total + float(df[name][index])
        avg = total / nnames
        diff = avg - df[score][index]
        newset.append(diff)
    return newset


names = ['İno', 'K1', 'K3', 'K4', 'İ1']
score = 'Skor'
score_diff = construct(df, names, score)
distance = df['Skor']

dfimp = pd.DataFrame({'Skor': score_diff, 'K2': distance})
summary = rp.summary_cont(dfimp['Skor'].groupby(dfimp['K2']))
print(summary)
corr = dfimp.corr()
print(corr)
y = dfimp['Skor']
x = dfimp['K2']
plt.scatter(x, y)
plt.show()

few = 30
dfimp_last = dfimp[-few - 1:-1]
summary2 = rp.summary_cont(dfimp_last['Skor'].groupby(dfimp_last['K2']))
print(summary2)
corr2 = dfimp_last.corr()
print(corr2)
y = dfimp_last['Skor']
Example No. 18
# The variable SUSPECT has 0's for the runs before John's "improvement" and
# 1's from that point on, so it stands in for the presence of a new unknown
# factor, probably doping. The "difference in performances" is what we'd
# like to correlate with the suspected doping use.

names = ['JOE', 'BILL', 'JACK']
john = 'JOHN'
john_diff = construct(df, names, john)
unknown = df['SUSPECT']

# in general
print(
    "In general, John's performance differs from the average of other athletes..."
)
dfimp = pd.DataFrame({'JOHN': john_diff, 'UNKNOWN': unknown})
summary = rp.summary_cont(dfimp['JOHN'].groupby(dfimp['UNKNOWN']))
print(summary)
corr = dfimp.corr()
print(corr)
y = dfimp['JOHN']
x = dfimp['UNKNOWN']
plt.scatter(x, y)
plt.show()

# How about last few runs?
few = 63  # Twice the size of suspected period added
print(
    "Last", few,
    "runs: John's performance differs from the average of other athletes..."
)
dfimp_last = dfimp[-few - 1:-1]
Example No. 19
# !conda install -c researchpy researchpy

import pandas
import researchpy as rp
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

df = pandas.read_csv(
    "https://raw.githubusercontent.com/Opensourcefordatascience/Data-sets/master/crop_yield.csv"
)
df.boxplot(column=['Yield'], grid=True)

print(rp.summary_cont(df['Yield']))
print(rp.summary_cont(df.groupby(['Fert']))['Yield'])
print(rp.summary_cont(df.groupby(['Water']))['Yield'])
print(rp.summary_cont(df.groupby(['Fert', 'Water']))['Yield'])

# 2 way ANOVA
# Fits the model with the interaction term
# This will also automatically include the main effects for each factor
model = ols('Yield ~ C(Fert)*C(Water)', df).fit()

# Seeing if the overall model is significant
print(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}"
)
print(model.summary())
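The fitted interaction model is usually followed by the ANOVA table itself; a sketch of that next step on the same fitted model:

res = sm.stats.anova_lm(model, typ=2)
print(res)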
Example No. 20
smmouth = df.loc[df.network == 'smmouth']
ventral = df.loc[df.network == 'ventral']
vis = df.loc[df.network == 'vis']

aud_w_baseline = pd.concat([baseline, aud])
cingulo_w_baseline = pd.concat([baseline, cingulo])
dmn_w_baseline = pd.concat([baseline, dmn])
dorsal_w_baseline = pd.concat([baseline, dorsal])
fronto_w_baseline = pd.concat([baseline, fronto])
retro_w_baseline = pd.concat([baseline, retro])
smhand_w_baseline = pd.concat([baseline, smhand])
smmouth_w_baseline = pd.concat([baseline, smmouth])
ventral_w_baseline = pd.concat([baseline, ventral])
vis_w_baseline = pd.concat([baseline, vis])

summary_aud = rp.summary_cont(aud_w_baseline.groupby('network'))
summary_cingulo = rp.summary_cont(cingulo_w_baseline.groupby('network'))
summary_dmn = rp.summary_cont(dmn_w_baseline.groupby('network'))
summary_dorsal = rp.summary_cont(dorsal_w_baseline.groupby('network'))
summary_fronto = rp.summary_cont(fronto_w_baseline.groupby('network'))
summary_retro = rp.summary_cont(retro_w_baseline.groupby('network'))
summary_smhand = rp.summary_cont(smhand_w_baseline.groupby('network'))
summary_smmouth = rp.summary_cont(smmouth_w_baseline.groupby('network'))
summary_ventral = rp.summary_cont(ventral_w_baseline.groupby('network'))
summary_vis = rp.summary_cont(vis_w_baseline.groupby('network'))

summary_aud2 = rp.summary_cont(
    aud_w_baseline.groupby(['network', 'awake', 'mild', 'deep', 'recovery']))
summary_cingulo2 = rp.summary_cont(
    cingulo_w_baseline.groupby(
        ['network', 'awake', 'mild', 'deep', 'recovery']))
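The ten near-identical blocks above could be generated in a loop instead; a minimal sketch, assuming the same df, baseline frame, and pandas import are in scope:

networks = ['aud', 'cingulo', 'dmn', 'dorsal', 'fronto',
            'retro', 'smhand', 'smmouth', 'ventral', 'vis']
# One summary per network, each computed against the shared baseline
summaries = {
    name: rp.summary_cont(
        pd.concat([baseline, df.loc[df.network == name]]).groupby('network'))
    for name in networks
}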
Example No. 21
def descriptiveStatistics():
    df = checkEmptyValues()
    # Print the tables; as bare expressions they would be computed and discarded
    print(rp.summary_cont(df[["Üretim", "Tohum Fiyatı"]]))
    print(rp.summary_cat(df[["EkildiğiAy", "Bölge", "ÜretimSüresi"]]))
    print(df[["Üretim", "EkilenAlan"]].cov())
Example No. 22
print(my_contracts_df.head(5))
# I want to use the ContractPK column as the index instead of the 0,1,2,3,4,5, etc. that gets entered by default
my_contracts_df.set_index('ContractPK', inplace=True)
print(my_contracts_df.head(5))

pd.set_option('display.max_columns', 10)
# To describe the data, let's construct a correlation matrix to see how correlated our data are
corrMatrix = my_contracts_df.corr()
print(corrMatrix)
# Even with this many features, the data don't seem to be excessively correlated.
sns.heatmap(corrMatrix, annot=True)
plt.show()

print(my_contracts_df.info())

for col in ['QuotedPrice', 'NumberofSocialMediaConnections', 'SizeOfSalesTeam',
            'SalesTeamExperience', 'NumberPriorPurchases', 'CreditPercentage',
            'InterestRate', 'FinanceTermMonths']:
    print(rp.summary_cont(my_contracts_df[col].groupby(my_contracts_df['Status'])))
Example No. 23
    else:
        df.iat[column, 6] = 3
print(df["Excentricidad"])
df['Excentricidad'].replace({1: "baja", 2: 'media', 3: 'alta'}, inplace=True)
print(df["Excentricidad"])

logX = np.log1p(df['Mediana'])
df = df.assign(mediana_log=logX.values)
df.drop(['Mediana'], axis=1, inplace=True)

factores = [
    "Grado", "CoefAg", "CentCer", "CentCag", "Excentricidad", "PageRag"
]
for i in factores:
    plt.figure(figsize=(8, 6))  # new figure per factor so the plots don't overlay
    print(rp.summary_cont(df['FlujoMax'].groupby(df[i])))

    anova = pg.anova(
        dv='FlujoMax',
        between=i,
        data=df,
        detailed=True,
    )
    pg._export_table(anova, ("ANOVAsFlujoMax" + i + ".csv"))

    ax = sns.boxplot(x=df["FlujoMax"], y=df[i], data=df, palette="cubehelix")

    plt.savefig("boxplot_FlujoMax" + i + ".eps", bbox_inches='tight')
    tukey = pairwise_tukeyhsd(endog=df["FlujoMax"], groups=df[i], alpha=0.05)

    tukey.plot_simultaneous(xlabel='Flujo Maximo', ylabel=i)
Example No. 24
print(df['body_word_count'].median())
print(df['body_word_count'].mean())

norm_data = df['body_word_count']
print("Standard deviation of body word count is: ", end="")
print(norm_data.std())

print("The varianve of the body word count is: ", end="")
print(norm_data.var())

diff = df.body_word_count - df.body_unique_words
stats.probplot(diff, plot=plt)

print(stats.ttest_ind(df.body_word_count, df.body_unique_words))

print(rp.summary_cont(df.groupby('publish_time')['body_word_count']))

print(df.avg_word.mean())

df[['body_word_count', 'numerics']].plot(kind='box')
plt.show()

print(df['publish_time'].corr(df['body_word_count'], method='spearman'))
print(df['publish_time'].corr(df['numerics'], method='spearman'))
print(df['publish_time'].corr(df['stopwords'], method='spearman'))
print(df['abstract_word_count'].corr(df['body_word_count'], method='spearman'))
print(df['body_word_count'].corr(df['numerics'], method='spearman'))
print(df['body_word_count'].corr(df['stopwords'], method='spearman'))
print(df.stopwords.mean())
print(df.numerics.mean())
print(df.numerics.mode())
Example No. 25
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import numpy as np

from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

df = pd.read_csv(
    "Parameterized_dataset_OQ7E9S7KML_2019_3_18_5_18_58_44431.csv",
    index_col=None)
df.drop(['run', 'iterations'], axis=1, inplace=True)
print(rp.summary_cont(df['mean_exec_time']))

logX = np.log10(df['mean_exec_time'])
print(logX)
df = df.assign(mean_exec_time_log=logX.values)
print(df)
df.drop(['mean_exec_time'], axis=1, inplace=True)
print(df)

print(rp.summary_cont(df['mean_exec_time_log'].groupby(df['algorithm'])))

results = ols('mean_exec_time_log ~ C(algorithm)', data=df).fit()
print(results.summary())

aov_table = sm.stats.anova_lm(results, typ=2)
print(aov_table)
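pairwise_tukeyhsd and MultiComparison are imported above but never reached in the visible part of the snippet; a sketch of the post-hoc step they suggest, assuming the ANOVA comes back significant:

tukey = pairwise_tukeyhsd(endog=df['mean_exec_time_log'],
                          groups=df['algorithm'],
                          alpha=0.05)
print(tukey.summary())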
Example No. 26
def test_significance(df,
                      dependent_var,
                      *independent_vars,
                      formula=None,
                      logit_model=False,
                      correction_method='bonf',
                      anova_type=2):
    """
    Test the significance of independent vars on the dependent var and output
    the complete results of each step. This doesn't let us tune as many
    parameters as we might want to. (Don't use this generally)

    Args:
        df: DataFrame
        dependent_var: The name of the dependent variable column in df
        independent_vars: Names of the independent variable columns in df
        formula (str): A formula relating the vars. If not specified, no
            interactions are assumed
        logit_model (bool): If True, fit a binomial GLM instead of OLS
        correction_method (str): p-value correction method passed to the
            non-parametric follow-up tests
        anova_type (int): Type of ANOVA sums of squares to use

    Returns:
        output (str) : A string to print the results of each test
        results (dict) : A dictionary of results corresponding to each test
    """
    ALPHA = 0.05  # Used for diagnostic tests

    output = ''
    results = {
        'multicollinearity': False,
        'homoskedastic': True,
        'normal_distribution': True,
    }

    # First add the summary data
    summary_df = rp.summary_cont(
        df.groupby(list(independent_vars))[dependent_var])
    summary_df['median'] = df.groupby(
        list(independent_vars))[dependent_var].median()
    output += f'Summary:\n{summary_df}\n\n'
    results['summary'] = summary_df

    # Get the OLS model formula
    if formula is None:
        formula = f"{dependent_var} ~ {' + '.join([f'C({v})' for v in independent_vars])} "

    # Then create the model and fit the data
    if not logit_model:
        model = smapi.ols(formula, data=df)
    else:
        # model = smapi.logit(formula, data=df)
        model = smapi.glm(formula, data=df, family=sm.families.Binomial())
    model_results = model.fit()
    output += f"{model_results.summary()}\n\n"
    results['initial'] = model_results

    # Check for normality
    if not logit_model:
        w, pvalue = spstats.shapiro(model_results.resid)
        output += f'Shapiro-Wilk test: {w, pvalue}\n\n'
        results['shapiro'] = (
            w,
            pvalue,
        )
        # if pvalue < 1e-4:
        if pvalue < ALPHA:
            output += 'NON NORMAL detected. Do something else\n\n'
            results['normal_distribution'] = False

    # Check for homoskedasticity based on the normality test
    if not logit_model:
        unique_values = df.groupby(
            list(independent_vars)).size().reset_index().rename(
                columns={0: 'count'})
        hs_test_data = []
        for row in unique_values.itertuples(index=False):
            if len(independent_vars) > 1:
                selectors = [(df[v] == getattr(row, v))
                             for v in independent_vars]
                row_selector = np.logical_and(*selectors[:2])
                if len(independent_vars) > 2:
                    row_selector = np.logical_and(row_selector, selectors[2])
            else:
                v = independent_vars[0]
                row_selector = df[v] == getattr(row, v)
            hs_test_data.append(df.loc[row_selector, dependent_var])
        assert len(hs_test_data) == unique_values.shape[0]

        if results['normal_distribution']:
            w, pvalue = spstats.bartlett(*hs_test_data)
            output += f'Bartlett test: {w, pvalue}\n\n'
            results['bartlett'] = (
                w,
                pvalue,
            )
        else:
            w, pvalue = spstats.levene(*hs_test_data)
            output += f'Levene test: {w, pvalue}\n\n'
            results['levene'] = (
                w,
                pvalue,
            )
        if pvalue < ALPHA:
            output += 'HETEROSKEDASTICITY detected. Do something else\n\n'
            results['homoskedastic'] = False

        # Check that the condition number is reasonable
        if model_results.diagn['condno'] > 20:
            output += 'MULTICOLLINEARITY detected. Do something else\n\n'
            results['multicollinearity'] = True

    # If we are normal, non-multicollinear, and homoskedastic, perform ANOVA
    # and then multiple comparisons using Tukey's HSD. If heteroskedastic, then
    # we should use robust regression. Else, use a non-parametric test

    # TODO: Perhaps we should look into using the Wald test instead?
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.wald_test.html
    if results['normal_distribution'] and results[
            'homoskedastic'] and not logit_model:
        o, r = test_using_anova(model,
                                model_results,
                                True,
                                df,
                                dependent_var,
                                *independent_vars,
                                anova_type=anova_type)
        output += o
        results.update(r)

    elif results['normal_distribution'] and not logit_model:
        model = smapi.rlm(formula, data=df)
        rlm_results = model.fit()
        output += f"{rlm_results.summary()}\n\n"
        results['rlm'] = rlm_results

        o, r = test_using_anova(model,
                                rlm_results,
                                False,
                                df,
                                dependent_var,
                                *independent_vars,
                                anova_type=anova_type)
        output += o
        results.update(r)

    elif not logit_model:
        o, r = test_using_kruskal(df,
                                  dependent_var,
                                  *independent_vars,
                                  correction_method=correction_method)
        output += o
        results.update(r)

    # Return the outputs
    return output, results
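A minimal usage sketch; the DataFrame and its 'score' and 'condition' columns are hypothetical, and smapi, sm, and spstats are assumed to be the aliases for statsmodels.formula.api, statsmodels.api, and scipy.stats that the function body relies on:

# Hypothetical one-factor experiment: does `condition` affect `score`?
out, res = test_significance(df, 'score', 'condition')
print(out)
print(res['normal_distribution'], res['homoskedastic'])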
Example No. 27
'''
Date creation: Jan-3-2020
Description:   This program runs a two-factor ANOVA on the data set contained in the file RI.csv.
'''

import pandas
import researchpy as rp

import statsmodels.api as sm
from statsmodels.formula.api import ols 
import statsmodels.stats.multicomp

#Imports data from csv file
df = pandas.read_csv('RI.csv')

#Summary of the RI (Grand Mean)
sum_RI = rp.summary_cont(df['RI'])
print('\n--Overall summary:\n')
print(sum_RI)

#Summary of RI ordered by Genotype and Concentration
sum_RI_con = rp.summary_cont(df.groupby(['Genotype', 'Concentration']))['RI']
print('\n--Overall summary by groups:\n')
print(sum_RI_con)

#Fits the regression model. We include the interaction of Genotype and Concentration
model = ols('RI ~ C(Genotype) * C(Concentration) ', df).fit()

#Shows if the overall model is statistically significant
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}")

#Summary of the model
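The snippet is cut off after this comment; the line it announces is presumably the usual one:

print(model.summary())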
Example No. 28
print("                   ")

print("DATASET SCORES BY METHODS:")

print("            ")

dg = pd.DataFrame(index=range(7),
                  columns=['Method A', 'Method B', 'Method C', 'Method D'])
# Reshape the flat table into 4 method blocks of 7 rows x 2 columns and keep
# the score column (index 1) from each block
dfe = df.values.reshape(4, 7, 2)
dfee = dfe[:, :, 1]
for j in range(4):
    dg[dg.columns[j]] = dfee[j]
print(dg)

#df['Method'].replace({1:"Method A", 2: "Method B",3:"Method C", 4:"Method D"},inplace=True)
print(rp.summary_cont(df['Scores']))

print("   ")
print("Descritive Statistics for outcome variable DV")

print("   ")
print(rp.summary_cont(df['Scores'].groupby(df['Method'])))
print("     ")
print("ASSUMPTIONS FOR ANOVA TEST")
# INDEPENDENCE
print("       ")
print("INDEPENDENCE")
print("              ")
print("It is Assumed due to the statement ")
print("                 ")
#NORMALITY
Example No. 29
df = pd.read_csv("../data/all_annotated.csv", parse_dates=['publish_date'])

tracker = open('../outputs/stats.csv', "w", newline="")

#summary stats
tracker.write("Statistics\r\n")

#HYPOTHESIS 1 Kashmir over Pakistan
tracker.write(
    "\nHYPOTHESIS 1: Kashmir-related headlines will have more negative sentiment scores on average than non-Kashmir-related headlines in any given year\r\n"
)
levene = stats.levene(df['total_score'][df['is_kashmir'] == True],
                      df['total_score'][df['is_kashmir'] == False])
tracker.write("Variance is equal: %r, %s\n" % ((levene[1] > .05), levene))

rp.summary_cont(df.groupby(['year'])['total_score']).to_csv(tracker, mode="a")
rp.summary_cont(df.groupby(['is_kashmir'])['total_score']).to_csv(tracker,
                                                                  mode="a")
tracker.write("Summary by year and relation to Kashmir\n")
rp.summary_cont(df.groupby(['is_kashmir',
                            'year'])['total_score']).to_csv(tracker, mode="a")
model = ols('total_score ~ C(year)*C(is_kashmir)', df).fit()
# Seeing if the overall model is significant
tracker.write("\nSeeing if overall model is significant\n")
tracker.write(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}\n"
)
tracker.write(str(model.summary()))

tracker.write("\nTwo-way ANOVA\n")
res = sm.stats.anova_lm(model, typ=2)
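The snippet is cut off here; presumably the ANOVA table is then written out and the tracker file closed:

tracker.write(str(res))
tracker.close()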
Example No. 30
import pandas as pd
import researchpy as rp

# From https://www.pythonfordatascience.org/anova-python/

df = pd.read_csv("../datasets/difficile.csv")
df.drop('person', axis=1, inplace=True)

# Recoding value from numeric to string
df['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'}, inplace=True)

df.info()

print(rp.summary_cont(df['libido']))
print(rp.summary_cont(df['libido'].groupby(df['dose'])))
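The linked tutorial continues with the one-way ANOVA itself; a sketch of that step on the same df, with the statsmodels imports it needs:

import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('libido ~ C(dose)', data=df).fit()
print(sm.stats.anova_lm(model, typ=2))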