Example #1
def test_statsmodel_to_df():

    amplitude = 1.432

    df_cha = pd.DataFrame()
    for n in range(5):

        raw = simulate_nirs_raw(sfreq=3.,
                                amplitude=amplitude,
                                sig_dur=300.,
                                stim_dur=5.,
                                isi_min=15.,
                                isi_max=45.)
        design_matrix = make_first_level_design_matrix(raw, stim_dur=5.0)
        glm_est = run_GLM(raw, design_matrix)
        cha = glm_to_tidy(raw, glm_est, design_matrix)
        cha["ID"] = '%02d' % n
        df_cha = pd.concat([df_cha, cha])
    df_cha["theta"] = df_cha["theta"] * 1.0e6
    roi_model = smf.mixedlm("theta ~ -1 + Condition",
                            df_cha,
                            groups=df_cha["ID"]).fit(method='nm')
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert df["Coef."]["Condition[A]"] == amplitude
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)

    roi_model = smf.rlm("theta ~ -1 + Condition", df_cha,
                        groups=df_cha["ID"]).fit()
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert df["Coef."]["Condition[A]"] == amplitude
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)
Example #2
def linearRegression(segmentedValues):
	print("Linear regression")
	#regression = LinearRegression()
	linRegress = dict()
	for key in segmentedValues.keys():
		x = [x[0] for x in segmentedValues[key]]
		y = [x[1] for x in segmentedValues[key]]
		mean = [float(np.average(x)),float(np.average(y))]
		valuesDict = dict()
		valuesDict['x'] = x
		valuesDict['y'] = y
		valuesFrame = pd.DataFrame(valuesDict)
		try:
			rlmRes = sm.rlm(formula = 'y ~ x', data=valuesFrame).fit()
		except ZeroDivisionError:
			# I have no idea why this occurs; a problem with statsmodels
			#Return None
			print("divide by zero :( ")
			return None
		# Calculate r2_score (unfortunately, rlm does not give this to us)
		x = np.array(x)
		y = np.array(y)
		#Get the predicted values of Y
		y_pred = x*rlmRes.params.x+rlmRes.params.Intercept
		score = r2_score(y, y_pred)
		#These should both be positive -- put in abs anyway
		slopeConfInterval = abs(float(rlmRes.params.x) - float(rlmRes.conf_int(.005)[0].x))
		intConfInterval = abs(float(rlmRes.params.Intercept) - float(rlmRes.conf_int(.005)[0].Intercept))
		#Slope, Intercept, R^2, num of values, confidenceIntervals, mean of cluster
		linRegress[key] = [rlmRes.params.x, rlmRes.params.Intercept, score, len(x), [slopeConfInterval, intConfInterval], mean]
		print("Key: "+str(key)+" Slope: "+str(rlmRes.params.x)+" Intercept: "+str(rlmRes.params.Intercept)+"R2 Score: "+str(score)+" Num vals: "+str(len(x))+" confidence: "+str(slopeConfInterval)+", "+str(intConfInterval)+" mean: "+str(mean))
	return linRegress
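A minimal usage sketch for linearRegression (illustrative data; it assumes the snippet's module imports numpy as np, pandas as pd, r2_score from sklearn.metrics, and statsmodels.formula.api as sm, since sm.rlm is called with a formula):

import numpy as np
segmented = {0: [(float(i), 2.0 * float(i) + float(np.random.normal())) for i in range(30)]}
fits = linearRegression(segmented)
print(fits[0])  # [slope, intercept, r2, n_points, [slope_ci, intercept_ci], mean]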
Example #3
def report_rlm(formula, data, verbose=True, **kwargs):
    """Fit RLM, print a report, and return the fit object."""
    results = smf.rlm(formula, data=data, **kwargs).fit(**kwargs)
    summary = results.summary()

    if verbose:
        report = """\n{summary}\n""".format(summary=summary)
        print(report)

    return results
Example #4
def rolling_ols(formula: str,
                data: pd.DataFrame,
                window: int,
                r2_adj=False,
                expanding=False,
                robust=False,
                M=sm.robust.norms.AndrewWave()):

    para_res = {}
    r_2_res = {}
    model_sig = {}
    forcast_res = pd.Series(dtype=float)

    for i in range(len(data) - window + 1):

        if expanding:
            start_index = 0
        else:
            start_index = i

        tmp_df = data.iloc[start_index:i + window]
        forcast_x = data.iloc[i + window:i + window + 1]

        if robust:
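            # Two-stage robust fit: RLM supplies robust weights, and the WLS
            # refit below with those weights exposes R^2 and F-statistics that
            # the RLM results object does not provide.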
            rlm_model = smf.rlm(formula, data=tmp_df, M=M)
            ols_result = smf.wls(formula,
                                 data=tmp_df,
                                 weights=rlm_model.fit().weights).fit()
            # ols_result = sm.WLS(rlm_model.endog, rlm_model.exog,
            #                     weights=rlm_model.fit().weights).fit()
        else:
            ols_result = smf.ols(formula, data=tmp_df).fit()

        para_res[data.index[i + window - 1]] = ols_result.params
        model_sig[data.index[i + window - 1]] = ols_result.f_pvalue

        if r2_adj:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared_adj
        else:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared

        # One-step-ahead forecast
        forcast_res = pd.concat([forcast_res, ols_result.predict(forcast_x)])

    para_res = pd.DataFrame(para_res).T
    r_2_res = pd.Series(r_2_res)
    model_sig = pd.Series(model_sig)

    return para_res, r_2_res.mean(), model_sig, forcast_res
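A minimal usage sketch for rolling_ols (synthetic data; the column names, window length, and seed are illustrative, and the numpy/pandas/statsmodels imports assumed by the snippet must be in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.date_range('2020-01-01', periods=120, freq='D')
demo = pd.DataFrame({'x': rng.normal(size=120)}, index=idx)
demo['y'] = 0.5 * demo['x'] + rng.normal(scale=0.1, size=120)

params, mean_r2, sig, forecasts = rolling_ols('y ~ x', demo, window=60, robust=True)
print(params.tail())  # rolling coefficient estimates, indexed by window end date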
Example #5
def view_Analysis(model_type: model_type, headers_dependent: headers_dependent,
                  headers_factor: headers_factor,
                  headers_groups: headers_groups,
                  analysis_formula: analysis_formula):

    data = df

    mdl_string = 'noInput'

    if analysis_formula != '':
        mdl_string = analysis_formula
    else:
        if headers_dependent != 'Select' and headers_factor != 'Select':
            mdl_string = headers_dependent + ' ~ ' + headers_factor

    if mdl_string != 'noInput':

        if model_type == 'Ordinary Least Squares':
            model = ols(mdl_string, data).fit()
        elif model_type == 'Generalized Linear Models':
            model = glm(mdl_string, data, family=sm.families.Gamma()).fit()
        elif model_type == 'Robust Linear Models':
            model = rlm(mdl_string, data, M=sm.robust.norms.HuberT()).fit()
        elif model_type == 'Linear Mixed Effects Models':
            if headers_groups != 'Select':
                model = mixedlm(mdl_string, data,
                                groups=data[headers_groups]).fit()
        elif model_type == 'Discrete - Regression with binary - Logit':
            model = Logit(data[headers_dependent],
                          data[headers_factor].astype(float)).fit()
        elif model_type == 'Discrete - Regression with binary - Probit':
            model = Probit(data[headers_dependent],
                           data[headers_factor].astype(float)).fit()
        elif model_type == 'Discrete - Regression with nominal - MNLogit':
            y = data[headers_factor]
            x = sm.add_constant(data[headers_dependent], prepend=False)
            model = sm.MNLogit(y, x).fit()
        elif model_type == 'Discrete - Regression with count - Poisson':
            model = Poisson(data[headers_dependent],
                            data[headers_factor].astype(float)).fit()

        display(model.summary())
Example #6
def runModel(experiment,
             data,
             dependentVariable,
             independentVariables,
             regressionType='ols'):
    import statsmodels.formula.api as smf
    modelStr = modelString(experiment, dependentVariable, independentVariables)
    if regressionType == 'ols':
        model = smf.ols(modelStr, data=data)
    elif regressionType == 'gls':
        model = smf.gls(modelStr, data=data)
    elif regressionType == 'rlm':
        model = smf.rlm(modelStr, data=data)
    else:
        print('Unknown regression type {}. Exiting'.format(regressionType))
        import sys
        sys.exit()
    return model.fit()
Example #7
def rlm_formula(data, xseq, **params):
    """
    Fit RLM using a formula
    """
    eval_env = params['enviroment']
    formula = params['formula']
    init_kwargs, fit_kwargs = separate_method_kwargs(params['method_args'],
                                                     sm.RLM, sm.RLM.fit)
    model = smf.rlm(formula, data, eval_env=eval_env, **init_kwargs)
    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        warnings.warn(
            "Confidence intervals are not yet implemented "
            "for RLM smoothing.", PlotnineWarning)

    return data
Example #8
def test_statsmodel_to_df(func):
    func = getattr(smf, func)
    np.random.seed(0)

    amplitude = 1.432

    df_cha = pd.DataFrame()
    for n in range(5):

        raw = simulate_nirs_raw(sfreq=3.,
                                amplitude=amplitude,
                                sig_dur=300.,
                                stim_dur=5.,
                                isi_min=15.,
                                isi_max=45.)
        raw._data += np.random.normal(0, np.sqrt(1e-12), raw._data.shape)
        design_matrix = make_first_level_design_matrix(raw, stim_dur=5.0)
        glm_est = run_glm(raw, design_matrix)
        with pytest.warns(RuntimeWarning, match='Non standard source detect'):
            cha = glm_est.to_dataframe()
        cha["ID"] = '%02d' % n
        df_cha = pd.concat([df_cha, cha], ignore_index=True)
    df_cha["theta"] = df_cha["theta"] * 1.0e6
    roi_model = func("theta ~ -1 + Condition", df_cha,
                     groups=df_cha["ID"]).fit()
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert_allclose(df["Coef."]["Condition[A]"], amplitude, rtol=0.1)
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)

    roi_model = smf.rlm("theta ~ -1 + Condition", df_cha,
                        groups=df_cha["ID"]).fit()
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert_allclose(df["Coef."]["Condition[A]"], amplitude, rtol=0.1)
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)
Example #9
#### Influence Plot

fig, ax = plt.subplots(figsize=(8, 6))
fig = sm.graphics.influence_plot(crime_model, ax=ax)


#### Using robust regression to correct for outliers.

# Part of the problem here in recreating the Stata results is that M-estimators are not robust to leverage points. MM-estimators should do better with this example.

from statsmodels.formula.api import rlm


rob_crime_model = rlm("murder ~ urban + poverty + hs_grad + single", data=dta, M=sm.robust.norms.TukeyBiweight(3)).fit(
    conv="weights"
)
print(rob_crime_model.summary())


# rob_crime_model = rlm("murder ~ pctmetro + poverty + pcths + single", data=dta, M=sm.robust.norms.TukeyBiweight()).fit(conv="weights")
# print rob_crime_model.summary()


# There isn't yet an influence diagnostics method as part of RLM, but we can recreate them. (This depends on the status of [issue #888](https://github.com/statsmodels/statsmodels/issues/808))

weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)
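The snippet ends after recreating the leverage values; a minimal sketch of one way to plot them (matplotlib assumed; this mimics what an influence plot shows and is not a statsmodels API):

resid = rob_crime_model.resid[idx]  # residuals of the points kept above
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(hat_matrix_diag, resid / resid.std(), 'o')
ax.set_xlabel('recreated leverage (weighted hat matrix diagonal)')
ax.set_ylabel('standardized RLM residual')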
Example #10
        if not output[group1][yr][outcome]: continue
            
        x_means.append(yr)        
        for i in range(len(output[group1][yr][outcome])):
            x.append(yr)
            y.append(output[group2][yr][outcome][i]-output[group1][yr][outcome][i])
        y_means.append(np.mean(y))
    '''
    '''
    errorbar(output.index, yMeans, yerr=yErr, fmt='o-')
    result = smf.ols(data=pd.DataFrame({'y':Ys, 'x':Xs}), formula='y~x').fit()    
    #result = smf.OLS(y_means,smt.add_constant(x_means)).fit() 
    plot(years, np.array(years)*result.params[1] + result.params[0], 'r--')
    print('slope:'+str(result.params[1])+', '+str(result.pvalues[1]))
    '''

    
    title(outcome)
    xlabel('Year article was published')    
#    plot(x, y, 'x', x_means, y_means, 'o')
    plot(output.index, yMeans, 'o', alpha=0.9)
    plot(Xs, Ys, 'x', alpha=0.7)    
    result = smf.rlm(data=pd.DataFrame({'y':Ys, 'x':Xs}), formula='y~x').fit()    
    #result = smf.OLS(y_means,smt.add_constant(x_means)).fit() 
    plot(years, np.array(years)*result.params[1] + result.params[0], 'r--')
    figtext(0.6, 0.8, 'slope:'+str(np.around(result.params[1],4))+', p='+str(np.around(result.pvalues[1],3)))
    figtext(0.55, 0.75, 'green=raw data, blue=means')
    print('intercept:'+str(result.params[0])+', '+str(result.pvalues[0]))
    show()
    
    #raw_input('..')
Example #11
def test_significance(df,
                      dependent_var,
                      *independent_vars,
                      formula=None,
                      logit_model=False,
                      correction_method='bonf',
                      anova_type=2):
    """
    Test the significance of independent vars on the dependent var and output
    the complete results of each step. This doesn't let us tune as many
    parameters as we might want to. (Don't use this generally)

    Args:
        df: DataFrame
        dependent_var: The name of the dependent variable column in df
        independent_vars: Array of independent variable columns in df
        formula (str): A formula relating the vars. If not specified, no
            interactions are assumed

    Returns:
        output (str) : A string to print the results of each test
        results (dict) : A dictionary of results corresponding to each test
    """
    ALPHA = 0.05  # Used for diagnostic tests

    output = ''
    results = {
        'multicollinearity': False,
        'homoskedastic': True,
        'normal_distribution': True,
    }

    # First add the summary data
    summary_df = rp.summary_cont(
        df.groupby(list(independent_vars))[dependent_var])
    summary_df['median'] = df.groupby(
        list(independent_vars))[dependent_var].median()
    output += f'Summary:\n{summary_df}\n\n'
    results['summary'] = summary_df

    # Get the OLS model formula
    if formula is None:
        formula = f"{dependent_var} ~ {' + '.join([f'C({v})' for v in independent_vars])} "

    # Then create the model and fit the data
    if not logit_model:
        model = smapi.ols(formula, data=df)
    else:
        # model = smapi.logit(formula, data=df)
        model = smapi.glm(formula, data=df, family=sm.families.Binomial())
    model_results = model.fit()
    output += f"{model_results.summary()}\n\n"
    results['initial'] = model_results

    # Check for normality
    if not logit_model:
        w, pvalue = spstats.shapiro(model_results.resid)
        output += f'Shapiro-Wilk test: {w, pvalue}\n\n'
        results['shapiro'] = (
            w,
            pvalue,
        )
        # if pvalue < 1e-4:
        if pvalue < ALPHA:
            output += 'NON NORMAL detected. Do something else\n\n'
            results['normal_distribution'] = False

    # Check for homoskedasticity based on the normality test
    if not logit_model:
        unique_values = df.groupby(
            list(independent_vars)).size().reset_index().rename(
                columns={0: 'count'})
        hs_test_data = []
        for row in unique_values.itertuples(index=False):
            if len(independent_vars) > 1:
                selectors = [(df[v] == getattr(row, v))
                             for v in independent_vars]
                row_selector = np.logical_and(*selectors[:2])
                if len(independent_vars) > 2:
                    row_selector = np.logical_and(row_selector, selectors[2])
            else:
                v = independent_vars[0]
                row_selector = df[v] == getattr(row, v)
            hs_test_data.append(df.loc[row_selector, dependent_var])
        assert len(hs_test_data) == unique_values.shape[0]

        if results['normal_distribution']:
            w, pvalue = spstats.bartlett(*hs_test_data)
            output += f'Bartlett test: {w, pvalue}\n\n'
            results['bartlett'] = (
                w,
                pvalue,
            )
        else:
            w, pvalue = spstats.levene(*hs_test_data)
            output += f'Levene test: {w, pvalue}\n\n'
            results['levene'] = (
                w,
                pvalue,
            )
        if pvalue < ALPHA:
            output += 'HETEROSKEDASTICITY detected. Do something else\n\n'
            results['homoskedastic'] = False

        # Check that the condition number is reasonable
        if model_results.diagn['condno'] > 20:
            output += 'MULTICOLLINEARITY detected. Do something else\n\n'
            results['multicollinearity'] = True

    # If we are normal, non-multicollinear, and homoskedastic, perform ANOVA
    # and then multiple comparisons using Tukey's HSD. If heteroskedastic, then
    # we should use robust regression. Else, use a non-parametric test

    # TODO: Perhaps we should look into using the Wald test instead?
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.wald_test.html
    if results['normal_distribution'] and results[
            'homoskedastic'] and not logit_model:
        o, r = test_using_anova(model,
                                model_results,
                                True,
                                df,
                                dependent_var,
                                *independent_vars,
                                anova_type=anova_type)
        output += o
        results.update(r)

    elif results['normal_distribution'] and not logit_model:
        model = smapi.rlm(formula, data=df)
        rlm_results = model.fit()
        output += f"{rlm_results.summary()}\n\n"
        results['rlm'] = rlm_results

        o, r = test_using_anova(model,
                                rlm_results,
                                False,
                                df,
                                dependent_var,
                                *independent_vars,
                                anova_type=anova_type)
        output += o
        results.update(r)

    elif not logit_model:
        o, r = test_using_kruskal(df,
                                  dependent_var,
                                  *independent_vars,
                                  correction_method=correction_method)
        output += o
        results.update(r)

    # Return the outputs
    return output, results
Example #12
print(result_qr.conf_int(0.68))

# Covariance of fit parameters
pcov = result_qr.cov_params()

# inverse standard deviation of fit parameters
pcor = np.diag(1 / np.sqrt(np.diag(pcov)))
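# pcor is diagonal, so cov_params(pcor) below computes pcor @ pcov @ pcor.T,
# i.e. it rescales the covariance into a parameter correlation matrix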

print('\n*Covariance* of fit parameters')
print(result_qr.cov_params())

print('\n*Cross correlation* of fit parameters')
print(result_qr.cov_params(pcor))

print('***RLM REGRESSION***')
# Create regression model
model_rlm = smf.rlm('y ~ 1 + t + np.square(t)',
                    data,
                    M=sm.robust.norms.TukeyBiweight())

# Do the regression fit
result_rlm = model_rlm.fit()

# Display data and best fit
plt.clf()
plt.plot(t, y, 'o', label='data')
plt.plot(t, model.predict(result.params), label='OLS')
plt.plot(t, model_qr.predict(result_qr.params), label='QuantReg')
plt.plot(t, model_rlm.predict(result_rlm.params), label='RLM')
plt.legend(loc='lower left')
Example #13
            x.append(j)
            y.append(diffy)    
    
    
    '''
    ax = fig.add_subplot(8, 1, i)
#    ax.plot(x, y, '.', alpha=0.99)
    ax.plot(years, means, '.', alpha=0.99)
    if i==0:plt.xlabel('Years after last GSS wave used')
   
    #ax.text(0.1, i, outcome, bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})  

    formula = outcome+'~years'
    result = smf.ols(formula, data=pd.DataFrame({'years':x, outcome:y}).dropna(axis=0), missing='drop').fit()
    results[outcome] = result    
    ax.plot(years, np.array(years)*result.params[1] + result.params[0], 'r--')
#    fig.savefig('test.png', dpi=100)
    '''
    plot(years, means, '.', alpha=0.8)
    xlim((-1,43))
    xlabel('Years after publication')
    ylabel('Change in ' + outcomeMap[outcome])
    title(outcomeMap[outcome] + ' Over Time')
    formula = outcome+'~years'
    result = smf.rlm(formula, data=pd.DataFrame({'years':x, outcome:y}).dropna(axis=0), missing='drop').fit()
    plot(years, np.array(years)*result.params[1] + result.params[0], 'r--')
    figtext(0.2, 0.3, 'slope:'+str(np.around(result.params[1],4))+', p='+str(np.around(result.pvalues[1],3)))
#    figtext(0.6, 0.75, 'blue dot = model from an article')
    print('intercept:'+str(result.params[0])+', '+str(result.pvalues[0]))

    show()
Example #14
# ### Influence Plot

fig, ax = plt.subplots(figsize=(8, 6))
fig = sm.graphics.influence_plot(crime_model, ax=ax)

# ### Using robust regression to correct for outliers.

# Part of the problem here in recreating the Stata results is that
# M-estimators are not robust to leverage points. MM-estimators should do
# better with this example.

from statsmodels.formula.api import rlm

rob_crime_model = rlm("murder ~ urban + poverty + hs_grad + single",
                      data=dta,
                      M=sm.robust.norms.TukeyBiweight(3)).fit(conv="weights")
print(rob_crime_model.summary())

#rob_crime_model = rlm("murder ~ pctmetro + poverty + pcths + single",
# data=dta, M=sm.robust.norms.TukeyBiweight()).fit(conv="weights")
#print(rob_crime_model.summary())

# There isn't yet an influence diagnostics method as part of RLM, but we
# can recreate them. (This depends on the status of [issue
# #888](https://github.com/statsmodels/statsmodels/issues/808))

weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]
ww = weights[idx] / weights[idx].mean()
Example #15
ax_21 = plt.Subplot(f, gs02[1, 1])
f.add_subplot(ax_21)
swarm_boxplot(ax_21, model_exp2, 'd', ' ', 2)
ax_21.set_ylabel('Bucket bias\n(Bayesian model)')

# ---------------------------------------------------------------------------------------
# 12. Plot robust linear regression of perseveration probability on bucket-shift parameter
# ---------------------------------------------------------------------------------------

# Data frame for regression model
data = pd.DataFrame()
data['pers'] = pers_noPush['pers'].copy()
data['d'] = model_exp2['d'].copy()

# Robust linear regression
mod = smf.rlm(formula='d ~ pers', M=sm.robust.norms.TukeyBiweight(3), data=data)
res = mod.fit(conv="weights")
print(res.summary())

# Plot results
ax_22 = plt.Subplot(f, gs02[1, 2])
f.add_subplot(ax_22)
x = pers_noPush['pers'].copy()
y = model_exp2['d'].copy()
ax_22.plot(x, y, '.', color='gray', alpha=0.7, markersize=2)
ax_22.plot(x, res.fittedvalues, '-', label="RLM", color="k")
ax_22.set_ylabel('Bucket bias\n(Bayesian model)')
ax_22.set_xlabel('Estimated\nperseveration probability')
ax_22.set_xticks(np.arange(0, 1, 0.2))

# --------------------------------------
Example #16
def test_missing():
    # see GH#2083
    import statsmodels.formula.api as smf

    d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan]}
    smf.rlm('Foo ~ Bar', data=d)
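    # Note (not in the original test): the formula interface defaults to
    # missing='drop', so the NaN row in Bar is dropped and model construction
    # succeeds on the three complete rows.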
Example #17
            x.append(yr)
            y.append(output[group2][yr][outcome][i]-output[group1][yr][outcome][i])
        y_means.append(np.mean(y))
    '''
    '''
    errorbar(output.index, yMeans, yerr=yErr, fmt='o-')
    result = smf.ols(data=pd.DataFrame({'y':Ys, 'x':Xs}), formula='y~x').fit()    
    #result = smf.OLS(y_means,smt.add_constant(x_means)).fit() 
    plot(years, np.array(years)*result.params[1] + result.params[0], 'r--')
    print('slope:'+str(result.params[1])+', '+str(result.pvalues[1]))
    '''

    
    title(outcome)
    xlabel('Year article was published')    
#    plot(x, y, 'x', x_means, y_means, 'o')
    plot(outputRandom.index, yMeansRandom, 'ro', alpha=0.9)
    plot(outputActual.index, yMeansActual, 'go', alpha=0.9)
    #plot(Xs, Ys, 'x', alpha=0.7)    
    resultRandom = smf.rlm(data=pd.DataFrame({'y':YsRandom, 'x':np.array(XsRandom)-1973}), formula='y~x').fit()    
    resultActual = smf.rlm(data=pd.DataFrame({'y':YsActual, 'x':np.array(XsActual)-1973}), formula='y~x').fit()    

    #result = smf.OLS(y_means,smt.add_constant(x_means)).fit() 
    plot(years, (np.array(years)-1973)*resultRandom.params[1] + resultRandom.params[0], 'r--')
    plot(years, (np.array(years)-1973)*resultActual.params[1] + resultActual.params[0], 'g--')

    figtext(0.45, 0.75, 'Red: random slope:'+str(np.around(resultRandom.params[1],4))+', p='+str(np.around(resultRandom.pvalues[1],2)))
    figtext(0.45, 0.8, 'Green: actual slope:'+str(np.around(resultActual.params[1],4))+', p='+str(np.around(resultActual.pvalues[1],2)))
    figtext(0.15, 0.75, 'random int.:'+str(np.around(resultRandom.params[0],2))+', p='+str(np.around(resultRandom.pvalues[0], 2)))
    figtext(0.15, 0.8, 'actual int.:'+str(np.around(resultActual.params[0],2))+', p='+str(np.around(resultActual.pvalues[0], 2)))
    show()
Example #18
ols_model = ols('prestige ~ income + education', prestige).fit()
print(ols_model.summary())

print("######################")
print("Built in OLS")
print("######################")
# now get the robust estimate using huber
params, resids, squareResid, rank, s = olsModel(predictor, obs, intercept=True)
print(params)

print("######################")
print("Checking M-estimate")
print("######################")
rlm_model = rlm('prestige ~ income + education',
                prestige,
                M=sm.robust.norms.HuberT(t=1.345)).fit()
print(rlm_model.summary())
print("######################")
print("Built in M-estimate")
print("######################")
params, resids, scale, weights = mestimateModel(predictor,
                                                obs,
                                                weights="huber",
                                                intercept=True)
print(params)

print("######################")
print("Checking MAD")
print("######################")
np.random.seed(12345)
Example #19
    def agesex_adjust(self, df, sig_level=0.01):

        # Grab the participant data
        prt_data = self.participants

        for column in df.columns:

            # Merge the column with participant data
            sub = df.loc[:, [column]].merge(prt_data,
                                            left_index=True,
                                            right_on='username').dropna()
            sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']

            if (sub.shape[0] < 20):
                continue

            model = smf.rlm(formula='value~age',
                            data=sub,
                            M=statsmodels.robust.norms.TrimmedMean())
            res = model.fit()

            if (res.pvalues['age'] < sig_level):

                #print "corrected age", column, res.pvalues['age']
                temp = pandas.DataFrame(res.resid, columns=[column])

                ## Do partial regression
                #temp = pandas.DataFrame(smf.ols(formula='value~age', data=sub).fit().resid, columns=[column])

                # Set the index
                temp.index = sub['username']

                # Update the original dataframe
                df.update(temp)

            # Merge the column with participant data
            sub = df.loc[:, [column]].merge(prt_data,
                                            left_index=True,
                                            right_on='username').dropna()

            sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']

            model = smf.rlm(formula='value~C(gender)',
                            data=sub,
                            M=statsmodels.robust.norms.TrimmedMean())
            res = model.fit()

            if (res.pvalues['C(gender)[T.M]'] < sig_level):

                temp = pandas.DataFrame(res.resid, columns=[column])

                ## Do partial regression
                #temp = pandas.DataFrame(smf.ols(formula='value~C(gender)', data=sub).fit().resid, columns=[column])

                # Set the index
                temp.index = sub['username']

                # Update the original dataframe
                df.update(temp)

        return df
Example #20
# Data frame for regression model
data = pd.DataFrame()
data['pers'] = pers_noPush['pers'].copy()
data['d'] = model_exp2['d'].copy()
data['age_group'] = pers_noPush['age_group'].copy()

# Recode age dummy variable in reverse order, i.e., with older adults as reference because they seem to
# have the strongest effect
data.loc[data['age_group'] == 3, 'age_group'] = 2  # YA in the middle
data.loc[data['age_group'] == 1, 'age_group'] = 3  # CH last variable
data.loc[data['age_group'] == 4, 'age_group'] = 1  # OA reference

# Robust linear regression
mod = smf.rlm(
    formula=
    'd ~ pers + C(age_group, Treatment) + pers * C(age_group, Treatment)',
    M=sm.robust.norms.TukeyBiweight(3),
    data=data)
res = mod.fit(conv="weights")
print(res.summary())

# Plot results
plt.plot(pers_noPush[pers_noPush['age_group'] == 1]['pers'].copy(),
         model_exp2[pers_noPush['age_group'] == 1]['d'].copy(),
         '.',
         color=colors[0],
         alpha=1,
         markersize=5)
plt.plot(pers_noPush[pers_noPush['age_group'] == 3]['pers'].copy(),
         model_exp2[pers_noPush['age_group'] == 3]['d'].copy(),
         '.',
Example #21
    def delta_transform_agesex_adjust(self, sig_level=0.01):
        if self.type in ['GENOM', 'COACH']:
            return self.GetDataFrame()

        df = self.GetDataFrame()

        # Grab the participant data
        prt_data = self.participants
        # Build prt_data dataframe for multiple rounds
        prt_data1 = prt_data.copy()
        prt_data2 = prt_data.copy()
        prt_data3 = prt_data.copy()

        prt_data1['username'] = [
            x + '_1' for x in prt_data1['username'].tolist()
        ]
        prt_data2['username'] = [
            x + '_2' for x in prt_data2['username'].tolist()
        ]
        prt_data3['username'] = [
            x + '_3' for x in prt_data3['username'].tolist()
        ]
        prt_data = pandas.concat([prt_data1, prt_data2, prt_data3], axis=0)

        for column in df.columns:

            # Merge the column with participant data
            sub = df.loc[:, [column]].merge(prt_data,
                                            left_index=True,
                                            right_on='username').dropna()
            sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']

            if (sub.shape[0] < 20):
                continue

            model = smf.rlm(formula='value~age',
                            data=sub,
                            M=statsmodels.robust.norms.TrimmedMean())
            res = model.fit()

            if (res.pvalues['age'] < sig_level):

                #print "corrected age", column, res.pvalues['age']
                temp = pandas.DataFrame(res.resid, columns=[column])

                ## Do partial regression
                #temp = pandas.DataFrame(smf.ols(formula='value~age', data=sub).fit().resid, columns=[column])

                # Set the index
                temp.index = sub['username']

                # Update the original dataframe
                df.update(temp)

            # Merge the column with participant data
            sub = df.loc[:, [column]].merge(prt_data,
                                            left_index=True,
                                            right_on='username').dropna()

            sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']

            model = smf.rlm(formula='value~C(gender)',
                            data=sub,
                            M=statsmodels.robust.norms.TrimmedMean())
            res = model.fit()

            if (res.pvalues['C(gender)[T.M]'] < sig_level):

                temp = pandas.DataFrame(res.resid, columns=[column])

                ## Do partial regression
                #temp = pandas.DataFrame(smf.ols(formula='value~C(gender)', data=sub).fit().resid, columns=[column])

                # Set the index
                temp.index = sub['username']

                # Update the original dataframe
                df.update(temp)

        # Now split by round and calculate the delta values
        df['round'] = [int(x[-1]) for x in df.index.tolist()]
        df['username'] = [x.split('_')[0] for x in df.index.tolist()]
        r1 = df[(df['round'] == 1)].set_index('username').drop('round', axis=1)
        r2 = df[(df['round'] == 2)].set_index('username').drop('round', axis=1)
        r3 = df[(df['round'] == 3)].set_index('username').drop('round', axis=1)
        r1_r2 = r2 - r1
        r2_r3 = r3 - r2
        r1_r2.index = ["%s_1" % x for x in r1_r2.index.tolist()]
        r2_r3.index = ["%s_2" % x for x in r2_r3.index.tolist()]
        joined = pandas.concat([r1_r2, r2_r3], axis=0)
        return self._apply_restrictions(joined)
Example #22
def fit(model, data, model_type='ols', sample_rate=.8, figsize=(10, 10), fontsize=12):
    """
    Linear regression model with visualization of fitting parameters

    :param model: patsy model specification
    :param data: pandas dataframe of results
    :param model_type: ols or rlm  (ordinary least squares or robust linear model)
    :param sample_rate: float (range of 0 to 1).  partitions the training and testing set
    :param figsize: figure size of output
    :param fontsize: font size for regression output
    :return: tuple of axes.
    """

    full = data.copy()
    mask = np.random.uniform(low=0, high=1, size=len(full))

    if sample_rate < 1:
        train = data[mask <= sample_rate]
        test = data[mask > sample_rate]
    else:
        train = full

    y, x = patsy.dmatrices(model, train)
    
    # we never want to plot the intercept, so need to make space if it is missing
    has_int = 0
    for name in x.design_info.column_names:
        if name == 'Intercept':
            has_int = 1

    if model_type.lower() == 'ols':
        model = smf.ols(model, data=train,)
    elif model_type.lower() == 'rlm':
        model = smf.rlm(model, data=train)

    setattr(model.data.orig_exog, 'design_info', x.design_info)  # some bug in patsy makes me do this...
    results = model.fit()
        
    # get predict values from confirmation data set.
    if sample_rate < 1:
        yfit = results.predict(test)

        # bug in statsmodels should be fixed in 0.7.
        if yfit.shape != test[model.endog_names].shape:
            sample_rate = 1
    
    summ = results.summary2()

    var = model.endog_names  # y variable name
    
    # track all categorical variables because they are separated in the design
    # matrix.  We need to adjust the plotting later on to lump all the categoricals into one.
    categoricals = [c for c in x.design_info.column_names if c.startswith('C(')]
    cat_plots = set([re.split('[()]', x)[1] for x in categoricals])
    
    # generate linear regression line of the actual vs predicted plot.
    # slope, intercept, r_value, p_value, std_err = sps.linregress(y[:, 0], results.fittedvalues)
    xs = np.linspace(np.min(y[:, 0]), np.max(y[:, 0]), 100)
    
    # determine how many columns we need on the plot
    r, c = np.shape(x)
    if len(categoricals) > 0:
        c = c - len(categoricals) + len(cat_plots) - has_int
        
    # make a min of 3 columns, even if there are less factors
    if c < 3:
        c = 3
    
    fig = plt.figure(tight_layout=True, figsize=figsize)
    grid = gs.GridSpec(3, c)

    # model axis
    axm = fig.add_subplot(grid[0, 0:c-1])
    axm.scatter(y[:, 0], results.fittedvalues, label='Train')
    axm.plot(xs, xs, 'k--')
    
    # plot the sampled data along with the fitted data
    if sample_rate < 1:
        axm.scatter(test[var], yfit, label='Test', color='red')
    
    axm.set_title('Actual vs Predicted Plot')
    axm.set_ylabel('{} Predicted'.format(var))
    axm.set_xlabel(var)
    plt.setp(axm.xaxis.get_majorticklabels(), rotation=90)

    try:
        fig.text(.1, .9, '$R^2$ = {}\nRMSE = {}'.format(round(results.rsquared, 2), round(results.mse_total**.5, 4)))
    except AttributeError:
        pass

    axm.legend(scatterpoints=1, loc=4)

    # histogram axis
    axh = fig.add_subplot(grid[0, c-1])
    axh.hist(list(results.resid))
    axh.set_title('Model Residuals')
    plt.setp(axh.xaxis.get_majorticklabels(), rotation=90)
    
    # text axis
    axt = fig.add_subplot(grid[2, :])
    axt.text(0, 1, summ.as_text(),
             horizontalalignment='left', 
             verticalalignment='top', 
             family='Courier New',
             fontsize=fontsize,
             weight='semibold'
             )
    axt.axis('off')

    # plot scatter plots in factor row
    numcats = 0
    column = 1
    ski = 0
    for i, factor in enumerate(x.T):
        if x.design_info.column_names[i] == 'Intercept':  # skip the intercept
            ski = 1
            continue
        if x.design_info.column_names[i].startswith('C('):
            numcats += 1
            continue
        
        column = i - numcats - ski
            
        # draw scatter plot
        ax = fig.add_subplot(grid[1, column])
        ax.scatter(factor, y[:, 0])
        ax.set_xlabel(x.design_info.column_names[i])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

        # draw dotted linear regression line on each factor plot
        slope, intercept, r_value, p_value, std_err = sps.linregress(factor, y[:, 0])
        xs = np.linspace(np.min(factor), np.max(factor), 100)
        ys = xs * slope + intercept
        ax.plot(xs, ys, 'r--')
        ax.set_ylabel(var)

    # plot categorical plots in factor row
    for i, factor in enumerate(cat_plots, start=column+1):
        ax = fig.add_subplot(grid[1, i])
        ax = full.boxplot(column=var, by=factor, ax=ax, showfliers=False)
        ax.set_title('')
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

    fig.suptitle('')

    return fig, (fig.get_axes()), results
Example #23

print(infl.summary_frame().loc['minister'])


sidak = ols_model.outlier_test('sidak')
sidak.sort_values('unadj_p', inplace=True)
print(sidak)


fdr = ols_model.outlier_test('fdr_bh')
fdr.sort_values('unadj_p', inplace=True)
print(fdr)


rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())


print(rlm_model.weights)


#### Hertzsprung-Russell data for Star Cluster CYG 0B1 - Leverage Points

# * Data is on the luminosity and temperature of 47 stars in the direction of Cygnus.

dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data


from matplotlib.patches import Ellipse
fig = plt.figure(figsize=(12,8))
Example #24
def fit(model,
        data,
        model_type='ols',
        sample_rate=.8,
        figsize=(10, 10),
        fontsize=12):
    """
    Linear regression model with visualization of fitting parameters

    :param model: patsy model specification
    :param data: pandas dataframe of results
    :param model_type: ols or rlm  (ordinary least squares or robust linear model)
    :param sample_rate: float (range of 0 to 1).  partitions the training and testing set
    :param figsize: figure size of output
    :param fontsize: font size for regression output
    :return: tuple of axes.
    """

    full = data.copy()
    mask = np.random.uniform(low=0, high=1, size=len(full))

    if sample_rate < 1:
        train = data[mask <= sample_rate]
        test = data[mask > sample_rate]
    else:
        train = full

    y, x = patsy.dmatrices(model, train)

    # we never want to plot the intercept, so need to make space if it is missing
    has_int = 0
    for name in x.design_info.column_names:
        if name == 'Intercept':
            has_int = 1

    if model_type.lower() == 'ols':
        model = smf.ols(
            model,
            data=train,
        )
    elif model_type.lower() == 'rlm':
        model = smf.rlm(model, data=train)

    setattr(model.data.orig_exog, 'design_info',
            x.design_info)  # some bug in patsy makes me do this...
    results = model.fit()

    # get predict values from confirmation data set.
    if sample_rate < 1:
        yfit = results.predict(test)

        # bug in statsmodels should be fixed in 0.7.
        if yfit.shape != test[model.endog_names].shape:
            sample_rate = 1

    summ = results.summary2()

    var = model.endog_names  # y variable name

    # track all categorical variables because they are separated in the design
    # matrix.  We need to adjust the plotting later on to lump all the categoricals into one.
    categoricals = [
        c for c in x.design_info.column_names if c.startswith('C(')
    ]
    cat_plots = set([re.split('[()]', x)[1] for x in categoricals])

    # generate linear regression line of the actual vs predicted plot.
    # slope, intercept, r_value, p_value, std_err = sps.linregress(y[:, 0], results.fittedvalues)
    xs = np.linspace(np.min(y[:, 0]), np.max(y[:, 0]), 100)

    # determine how many columns we need on the plot
    r, c = np.shape(x)
    if len(categoricals) > 0:
        c = c - len(categoricals) + len(cat_plots) - has_int

    # make a min of 3 columns, even if there are less factors
    if c < 3:
        c = 3

    fig = plt.figure(tight_layout=True, figsize=figsize)
    grid = gs.GridSpec(3, c)

    # model axis
    axm = fig.add_subplot(grid[0, 0:c - 1])
    axm.scatter(y[:, 0], results.fittedvalues, label='Train')
    axm.plot(xs, xs, 'k--')

    # plot the sampled data along with the fitted data
    if sample_rate < 1:
        axm.scatter(test[var], yfit, label='Test', color='red')

    axm.set_title('Actual vs Predicted Plot')
    axm.set_ylabel('{} Predicted'.format(var))
    axm.set_xlabel(var)
    plt.setp(axm.xaxis.get_majorticklabels(), rotation=90)

    try:
        fig.text(
            .1, .9,
            '$R^2$ = {}\nRMSE = {}'.format(round(results.rsquared, 2),
                                           round(results.mse_total**.5, 4)))
    except AttributeError:
        pass

    axm.legend(scatterpoints=1, loc=4)

    # histogram axis
    axh = fig.add_subplot(grid[0, c - 1])
    axh.hist(list(results.resid))
    axh.set_title('Model Residuals')
    plt.setp(axh.xaxis.get_majorticklabels(), rotation=90)

    # text axis
    axt = fig.add_subplot(grid[2, :])
    axt.text(0,
             1,
             summ.as_text(),
             horizontalalignment='left',
             verticalalignment='top',
             family='Courier New',
             fontsize=fontsize,
             weight='semibold')
    axt.axis('off')

    # plot scatter plots in factor row
    numcats = 0
    column = 1
    ski = 0
    for i, factor in enumerate(x.T):
        if x.design_info.column_names[i] == 'Intercept':  # skip the intercept
            ski = 1
            continue
        if x.design_info.column_names[i].startswith('C('):
            numcats += 1
            continue

        column = i - numcats - ski

        # draw scatter plot
        ax = fig.add_subplot(grid[1, column])
        ax.scatter(factor, y[:, 0])
        ax.set_xlabel(x.design_info.column_names[i])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

        # draw dotted linear regression line on each factor plot
        slope, intercept, r_value, p_value, std_err = sps.linregress(
            factor, y[:, 0])
        xs = np.linspace(np.min(factor), np.max(factor), 100)
        ys = xs * slope + intercept
        ax.plot(xs, ys, 'r--')
        ax.set_ylabel(var)

    # plot categorical plots in factor row
    for i, factor in enumerate(cat_plots, start=column + 1):
        ax = fig.add_subplot(grid[1, i])
        ax = full.boxplot(column=var, by=factor, ax=ax, showfliers=False)
        ax.set_title('')
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

    fig.suptitle('')

    return fig, (fig.get_axes()), results
Example #25
student = infl.summary_frame()['student_resid']
print(student)

print(student.loc[np.abs(student) > 2])

print(infl.summary_frame().loc['minister'])

sidak = ols_model.outlier_test('sidak')
sidak.sort_values('unadj_p', inplace=True)
print(sidak)

fdr = ols_model.outlier_test('fdr_bh')
fdr.sort_values('unadj_p', inplace=True)
print(fdr)

rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())

print(rlm_model.weights)

# ### Hertzsprung-Russell data for Star Cluster CYG 0B1 - Leverage Points

# * Data is on the luminosity and temperature of 47 stars in the direction
# of Cygnus.

dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data

from matplotlib.patches import Ellipse
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(
    111,
Example #26
def fit(self, df, formula):
    return smf.rlm(formula=formula, data=df).fit()
Example #27
plt.figure(figsize=(5, 2))
seaborn.boxplot(data.nb_passengers_2001 - data.nb_passengers_2000)
plt.title('NB passengers: 2001 - 2000')
plt.subplots_adjust()


##############################################################################
# Statistical testing: dependence of fare on distance and number of
# passengers
import statsmodels.formula.api as sm

result = sm.ols(formula='fare ~ 1 + dist + nb_passengers', data=data_flat).fit()
print(result.summary())

# Using a robust fit
result = sm.rlm(formula='fare ~ 1 + dist + nb_passengers', data=data_flat).fit()
print(result.summary())


##############################################################################
# Statistical testing: regression of fare on distance: 2001/2000 difference

result = sm.ols(formula='fare_2001 - fare_2000 ~ 1 + dist', data=data).fit()
print(result.summary())

# Plot the corresponding regression
data['fare_difference'] = data['fare_2001'] - data['fare_2000']
seaborn.lmplot(x='dist', y='fare_difference', data=data)

plt.show()
Example #28
regress_b['Fit'] = data_b[:, 2]
fig = plt.figure(1, figsize=(12, 4))
fig.suptitle('')
ax1, ax2 = fig.subplots(1, 2)

ax1.plot(regress_a['Temperatur'], regress_a['Spannung'], 'ro')
ax1.plot(regress_a['Temperatur'], regress_a['Fit'], 'r--')
ax1.plot(regress_b['Temperatur'], regress_b['Spannung'], 'bo')
ax1.plot(regress_b['Temperatur'], regress_b['Fit'], 'b--')
ax1.axis([-10, 110, 2, 5])
ax1.set_xlabel('Temperature $T$ / °C')
ax1.set_ylabel('Voltage $U$ / V')
ax1.set_title('Robust regression')
ax1.grid(True)
""" Define and fit the robust linear regression """

model = rlm("Spannung ~ Temperatur", regress_a,
            M=sm.robust.norms.AndrewWave()).fit()
print(model.summary())
regress_a['Fit_robust'] = model.fittedvalues
regress_a['Gewichtungsfaktor'] = model.weights
""" Darstellung der robusten Regressionsfunktion und der Gewichte """

ax1.plot(regress_a['Temperatur'], regress_a['Fit_robust'], 'g')

ax2.bar(regress_a['Temperatur'], regress_a['Gewichtungsfaktor'], 5, color='b')
ax2.set_xlabel('Temperature $T$ / °C')
ax2.grid(True)
ax2.set_ylabel('Weighting factor')
Example #29
import statsmodels.formula.api as smf
import statsmodels.api as sm
import pandas
import seaborn as sns  # seaborn.apionly was removed in seaborn 0.9
#iris = sns.load_dataset('iris')
#print(iris.head())
# Fit model and print summary
#rlm_model = smf.rlm(formula='sepal_length ~ sepal_width + petal_length + petal_width', data=iris, M=None)

data = sm.datasets.get_rdataset('epil', package='MASS').data

fam = sm.families.Poisson()
ind = sm.cov_struct.Exchangeable()
mod = smf.rlm("y ~ age + trt + base", data=data)
r = mod.fit()
print(r.summary())
Example #30
student = infl.summary_frame()["student_resid"]
print(student)

print(student.loc[np.abs(student) > 2])

print(infl.summary_frame().loc["minister"])

sidak = ols_model.outlier_test("sidak")
sidak.sort_values("unadj_p", inplace=True)
print(sidak)

fdr = ols_model.outlier_test("fdr_bh")
fdr.sort_values("unadj_p", inplace=True)
print(fdr)

rlm_model = rlm("prestige ~ income + education", prestige).fit()
print(rlm_model.summary())

print(rlm_model.weights)

# ### Hertzsprung-Russell data for Star Cluster CYG 0B1 - Leverage Points

# * Data is on the luminosity and temperature of 47 stars in the direction
# of Cygnus.

dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data

from matplotlib.patches import Ellipse

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(
Example #31
def test_missing():
    # see GH#2083
    import statsmodels.formula.api as smf

    d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan]}
    mod = smf.rlm('Foo ~ Bar', data=d)
Example #32
def smafit(X0,
           Y0,
           W0=None,
           cl=0.95,
           intercept=True,
           robust=False,
           rmethod='FastMCD'):
    """Standard Major-Axis (SMA) line fitting
    
    Calculate standard major axis, aka reduced major axis, fit to 
    data X and Y. The main advantage of this over ordinary least squares is 
    that the best fit of Y to X will be the same as the best fit of X to Y.
    
    The fit equations and confidence intervals are implemented following 
    Warton et al. (2006). Robust fits use the FastMCD covariance estimate 
    from Rousseeuw and Van Driessen (1999). While there are many alternative 
    robust covariance estimators (e.g. other papers by D.I. Warton using M-estimators), 
    the FastMCD algorithm is the default in Matlab. When the standard error or 
    uncertainty of each point is known, then weighted SMA may be preferable to 
    robust SMA. The conventional choice of weights for each point i is 
    W_i = 1 / ( var(X_i) + var(Y_i) ), where var() is the variance 
    (squared standard error).
    
    References 
    Warton, D. I., Wright, I. J., Falster, D. S. and Westoby, M.: 
        Bivariate line-fitting methods for allometry, Biol. Rev., 81(02), 259, 
        doi:10.1017/S1464793106007007, 2006.
    Rousseeuw, P. J. and Van Driessen, K.: A Fast Algorithm for the Minimum 
        Covariance Determinant Estimator, Technometrics, 41(3), 1999.

    Parameters
    ----------
    X, Y : array_like
        Input values; must have the same length.
    W    : optional array of weights for each X-Y point, typically W_i = 1/(var(X_i)+var(Y_i)) 
    cl   : float (default = 0.95)
        Desired confidence level for output. 
    intercept : boolean (default=True)
        Specify if the fitted model should include a non-zero intercept.
        The model will be forced through the origin (0,0) if intercept=False.
    robust : boolean (default=False)
        Use statistical methods that are robust to the presence of outliers
    rmethod: string (default='FastMCD')
        Method for calculating robust variance and covariance. Options:
        'MCD' or 'FastMCD' for Fast MCD
        'Huber' for Huber's T: reduce, not eliminate, influence of outliers
        'Biweight' for Tukey's Biweight: reduces then eliminates influence of outliers

        
    Returns
    -------
    Slope     : float
        Slope or Gradient of Y vs. X
    Intercept : float
        Y intercept.
    ste_grad : float
        Standard error of gradient estimate
    ste_int : float
        standard error of intercept estimate
    ci_grad : [float, float]
        confidence interval for gradient at confidence level cl
    ci_int : [float, float]
        confidence interval for intercept at confidence level cl
    """

    import numpy as np
    import scipy.stats as stats
    from sklearn.covariance import MinCovDet
    import statsmodels.formula.api as smf
    import statsmodels.robust.norms as norms

    # Make sure arrays have the same length
    assert (len(X0) == len(Y0)), 'Arrays X and Y must have the same length'
    if (W0 is not None):
        assert (
            len(W0) == len(X0)), 'Array W must have the same length as X and Y'

    # Make sure cl is within the range 0-1
    assert (cl < 1), 'cl must be less than 1'
    assert (cl > 0), 'cl must be greater than 0'

    if (W0 is None):
        W0 = np.zeros_like(X0) + 1

    # Drop any NaN elements of X or Y
    # Infinite values are allowed but will make the result undefined
    idx = ~np.logical_or(np.isnan(X0), np.isnan(Y0))

    X = X0[idx]
    Y = Y0[idx]
    W = W0[idx]

    # Number of observations
    N = len(X)

    # Degrees of freedom for the model
    if (intercept):
        dfmod = 2
    else:
        dfmod = 1

    # Choose whether to use methods robust to outliers
    if (robust):

        # Choose the robust method
        if ((rmethod.lower() == 'mcd') or (rmethod.lower() == 'fastmcd')):
            # FAST MCD

            if (not intercept):
                # intercept=False could possibly be supported by calculating
                # using mcd.support_ as weights in an explicit variance/covariance calculation
                raise NotImplementedError(
                    'FastMCD method only supports SMA with intercept')

            # Fit robust model of mean and covariance
            mcd = MinCovDet().fit(np.array([X, Y]).T)

            # Robust mean
            Xmean = mcd.location_[0]
            Ymean = mcd.location_[1]

            # Robust variance of X, Y
            Vx = mcd.covariance_[0, 0]
            Vy = mcd.covariance_[1, 1]

            # Robust covariance
            Vxy = mcd.covariance_[0, 1]

            # Number of observations used in mean and covariance estimate
            # excludes observations marked as outliers
            N = mcd.support_.sum()

        elif ((rmethod.lower() == 'biweight') or (rmethod.lower() == 'huber')):

            # Tukey's Biweight and Huber's T
            if (rmethod.lower() == 'biweight'):
                norm = norms.TukeyBiweight()
            else:
                norm = norms.HuberT()

            # Get weights for downweighting outliers
            # Fitting a linear model the easiest way to get these
            # Options include "TukeyBiweight" (totally removes large deviates)
            # "HuberT" (linear, not squared weighting of large deviates)
            rweights = smf.rlm('y~x+1', {'x': X, 'y': Y}, M=norm).fit().weights

            # Sum of weights and weights squared, for convenience
            rsum = np.sum(rweights)
            rsum2 = np.sum(rweights**2)

            # Mean
            Xmean = np.sum(X * rweights) / rsum
            Ymean = np.sum(Y * rweights) / rsum

            # Force intercept through zero, if requested
            if (not intercept):
                Xmean = 0
                Ymean = 0

            # Variance & Covariance
            Vx = np.sum((X - Xmean)**2 * rweights**2) / rsum2
            Vy = np.sum((Y - Ymean)**2 * rweights**2) / rsum2
            Vxy = np.sum((X - Xmean) * (Y - Ymean) * rweights**2) / rsum2

            # Effective number of observations
            N = rsum

        else:

            raise NotImplementedError(
                "smafit.py hasn't implemented rmethod={:%s}".format(rmethod))
    else:

        if (intercept):

            wsum = np.sum(W)

            # Average values
            Xmean = np.sum(X * W) / wsum
            Ymean = np.sum(Y * W) / wsum

            # Covariance matrix
            cov = np.cov(X, Y, ddof=1, aweights=W**2)

            # Variance
            Vx = cov[0, 0]
            Vy = cov[1, 1]

            # Covariance
            Vxy = cov[0, 1]

        else:

            # Force the line to pass through origin by setting means to zero
            Xmean = 0
            Ymean = 0

            wsum = np.sum(W)

            # Sum of squares in place of variance and covariance
            Vx = np.sum(X**2 * W) / wsum
            Vy = np.sum(Y**2 * W) / wsum
            Vxy = np.sum(X * Y * W) / wsum

    # Standard deviation
    Sx = np.sqrt(Vx)
    Sy = np.sqrt(Vy)

    # Correlation coefficient (equivalent to np.corrcoef()[1,0] for non-robust cases)
    R = Vxy / np.sqrt(Vx * Vy)

    #############
    # SLOPE

    Slope = np.sign(R) * Sy / Sx

    # Standard error of slope estimate
    ste_slope = np.sqrt(1 / (N - dfmod) * Sy**2 / Sx**2 * (1 - R**2))

    # Confidence interval for Slope
    B = (1 - R**2) / (N - dfmod) * stats.f.isf(1 - cl, 1, N - dfmod)
    ci_grad = Slope * (np.sqrt(B + 1) + np.sqrt(B) * np.array([-1, +1]))

    #############
    # INTERCEPT

    if (intercept):
        Intercept = Ymean - Slope * Xmean

        # Standard deviation of residuals
        # New Method: Formula from smatr R package (Warton)
        # This formula avoids large residuals of outliers when using robust=True
        Sr = np.sqrt(
            (Vy - 2 * Slope * Vxy + Slope**2 * Vx) * (N - 1) / (N - dfmod))

        # OLD METHOD
        # Standard deviation of residuals
        #resid = Y - (Intercept + Slope * X )
        # Population standard deviation of the residuals
        #Sr = np.std( resid, ddof=0 )

        # Standard error of the intercept estimate
        ste_int = np.sqrt(Sr**2 / N + Xmean**2 * ste_slope**2)

        # Confidence interval for Intercept
        tcrit = stats.t.isf((1 - cl) / 2, N - dfmod)
        ci_int = Intercept + ste_int * np.array([-tcrit, tcrit])

    else:

        # Set Intercept quantities to zero
        Intercept = 0
        ste_int = 0
        ci_int = np.array([0, 0])

    return Slope, Intercept, ste_slope, ste_int, ci_grad, ci_int
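A hedged usage sketch for smafit on synthetic data (the true slope, intercept, noise level, and seed are all illustrative):

import numpy as np
rng = np.random.default_rng(42)
x = rng.normal(size=200)
y = 1.5 * x + 0.3 + rng.normal(scale=0.5, size=200)
slope, intercept, ste_slope, ste_int, ci_grad, ci_int = smafit(x, y, cl=0.95)
print('slope', slope, 'CI', ci_grad)
print('intercept', intercept, 'CI', ci_int)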