# brfss_corr.py
"""
This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
NAME: brfss_corr.py
"""
import math
import matplotlib
import matplotlib.pyplot as pyplot
import random
import sys
import brfss_scatter
import correlation
import _19_brfss
def _log(t):
    """Return a list with the natural logarithm of each element of t.

    t: iterable of numbers accepted by math.log.
    """
    return list(map(math.log, t))
def _compute_correlations():
    """Print correlation statistics for respondent heights and weights.

    Reads the records through brfss_scatter.Respondents, then prints, in
    order: the number of records; the values labelled Pearson correlation
    for (heights, weights) and for (heights, log weights); the value
    labelled Spearman correlation for (heights, weights); the intercept
    and slope returned by correlation._least_squares for (heights, log
    weights); the coefficient of determination computed from the residuals
    of that fit; and the square root of that coefficient.
    """
    respondents = brfss_scatter.Respondents()
    respondents._read_records()
    print('Number of records:', len(respondents.records))

    heights, weights = respondents._get_height_weight()

    pearson_raw = correlation._corr(heights, weights)
    print('Pearson correlation (weights):', pearson_raw)

    log_weights = _log(weights)
    pearson_log = correlation._corr(heights, log_weights)
    print('Pearson correlation (log weights):', pearson_log)

    rank_corr = correlation._spearman_corr(heights, weights)
    print('Spearman correlation (weights):', rank_corr)

    # Fit log weight against height, then measure how much of the
    # variation in log weight the fitted line accounts for.
    intercept, gradient = correlation._least_squares(heights, log_weights)
    print('Least squares inter, slope (log weights):', intercept, gradient)

    residuals = correlation._residuals(
        heights, log_weights, intercept, gradient)
    r_squared = correlation._coef_determination(log_weights, residuals)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', math.sqrt(r_squared))
def main(name, *args):
    """Script entry point: print the height/weight correlation report.

    name: script name (sys.argv[0] when run from the command line); unused.
    args: any further command-line arguments; accepted and ignored so that
        calling main(*sys.argv) does not raise TypeError when the script
        is started with extra arguments.
    """
    _compute_correlations()
# Run the report only when this file is executed as a script, not when it
# is imported; the command-line arguments are unpacked into main().
if __name__ == "__main__":
    main(*sys.argv)
"""
Results:
Number of records: 414509
Pearson correlation (weights): 0.508736478974
Pearson correlation (log weights): 0.531728260599
Spearman correlation (weights): 0.541529498192
The Pearson correlation on the raw weights is low because of the effect of outliers.
Either of the others is a reasonable choice, but in this case because
we know the distribution of weights is lognormal, the log transform
might be the best choice.
If we didn't know what transform to use, I would be more inclined to
use Spearman's correlation, but I am less comfortable with it because
mapping to ranks is an information-losing transform, whereas the log
transform is not.
"""