/
student_scores_comparison.py
82 lines (62 loc) · 2.58 KB
/
student_scores_comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from matplotlib import pyplot as plt
import numpy as np
import unittest
""" Implementing a slide from a PyCon talk, Statistics for Hackers.
https://speakerdeck.com/pycon2016/jake-vanderplas-statistics-for-hackers
The question posed:
"Two students get the following scores over a term's modules:
student_1 84, 72, 57, 46, 63, 76, 99, 91: mean 73.5
student_2 81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69: mean 66.9
Student_1's mean score is 6.6 points higher than student_2's.
Is this difference statistically significant?"
"""
# Module scores quoted in the question: 8 results for the first student,
# 12 for the second.
STUDENT_1 = [84, 72, 57, 46, 63, 76, 99, 91]
STUDENT_2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]

# A second pair with the same group sizes (8 and 12) and means close to the
# first pair's, but with almost no spread within each student's scores.
STUDENT_3 = [73, 73, 73, 73, 73, 74, 74, 74]
STUDENT_4 = [67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 66]

# Each pair pooled into one array: first student's scores, then the second's.
SCORES_1 = np.array([*STUDENT_1, *STUDENT_2])
SCORES_2 = np.array([*STUDENT_3, *STUDENT_4])
def trial(scores, first_group_size=8):
    """Run one shuffle trial and return the difference of the group means.

    The pooled scores are randomly reordered, the first
    ``first_group_size`` of them are treated as the first student's scores
    and the remainder as the second student's.

    Args:
        scores: pooled scores, as a 1-D array or sequence. It is not
            modified; the shuffle works on a copy so that repeated calls
            with a shared array do not reorder the caller's data.
        first_group_size: how many shuffled scores go to the first group.
            Defaults to 8.

    Returns:
        Mean of the first group minus mean of the second group.
    """
    # np.random.permutation returns a shuffled copy, unlike
    # np.random.shuffle, which reorders its argument in place.
    shuffled = np.random.permutation(scores)
    first_group = shuffled[:first_group_size]
    second_group = shuffled[first_group_size:]
    return first_group.mean() - second_group.mean()
def significance_bound(hist, x):
    """Interpolate the histogram position at cumulative fraction ``x``.

    Args:
        hist: a 3-item sequence of (bin counts, bin edges, ignored item),
            the layout returned by ``plt.hist``. Counts and edges must be
            NumPy arrays, with one more edge than counts.
        x: cumulative fraction to look up, e.g. 0.05 or 0.95.

    Returns:
        The value obtained by linearly interpolating ``x`` against the
        running fraction of counts, where each bin's running fraction is
        paired with that bin's midpoint.
    """
    counts, edges, _ignored = hist
    total = counts.sum()
    cumulative_fraction = counts.cumsum() / total
    # Midpoint of every bin: the average of its left and right edges.
    bin_centres = (edges[1:] + edges[:-1]) * 0.5
    return np.interp(x, cumulative_fraction, bin_centres)
def main(tests=100000):
    """Plot the shuffled mean differences for both score sets.

    For each of SCORES_1 and SCORES_2, ``tests`` shuffle trials are run and
    the resulting differences of means are drawn as a 16-bin histogram.
    The observed difference between STUDENT_1 and STUDENT_2 is marked with
    a vertical line, and for each histogram the range between its 5% and
    95% bounds is shaded. The figure is then shown.

    Args:
        tests: number of shuffle trials per score set. Defaults to 100000.
    """
    net_scores_1 = [trial(SCORES_1) for _ in range(tests)]
    net_scores_2 = [trial(SCORES_2) for _ in range(tests)]

    h1 = plt.hist(net_scores_1, 16, color='darkblue', alpha=0.4,
                  label=r'$scoring_1$')
    h2 = plt.hist(net_scores_2, 16, color='red', alpha=0.7,
                  label=r'$scoring_2$')

    # Derive the observed difference from the data rather than from rounded
    # constants, so it stays correct if the scores are edited.
    actual_difference = np.mean(STUDENT_1) - np.mean(STUDENT_2)
    plt.axvline(actual_difference, lw=2, color='black',
                label=r'$Actual\ score\ difference$')

    # The shaded spans run from the 5% to the 95% bound of each histogram,
    # so they are labelled as such (they are not two-sigma intervals).
    bands = (
        (h1, 'skyblue', 0.3, '5%-95% range (scoring 1)'),
        (h2, 'salmon', 0.4, '5%-95% range (scoring 2)'),
    )
    for hist, colour, opacity, label in bands:
        plt.axvspan(significance_bound(hist, 0.05),
                    significance_bound(hist, 0.95),
                    color=colour,
                    alpha=opacity,
                    label=label)

    plt.legend()
    plt.show()
class StudentScoreTesting(unittest.TestCase):
    """Unit tests for the histogram helper used by the plot."""

    def test_significance_bound(self):
        """Check the 5% and 95% bounds of a fixed 10-bin histogram.

        The expected values follow from interpolating the requested
        fraction between the running count fractions of neighbouring bins
        (0.0039 and 0.0538 for 5%, 0.9242 and 0.9837 for 95%) and the
        midpoints of those bins.
        """
        h = (np.array([39, 499, 1277, 1028, 3199, 2644, 556, 595, 154, 9]),
             np.array([-4.375, -3.41666667, -2.45833333, -1.5, -0.54166667,
                       0.41666667, 1.375, 2.33333333, 3.29166667, 4.25,
                       5.20833333]),
             None)
        # Compare with a tolerance: the results come from floating-point
        # interpolation, so exact equality is too strict.
        self.assertAlmostEqual(-3.0104793, significance_bound(h, 0.05),
                               places=5)
        self.assertAlmostEqual(2.2697129, significance_bound(h, 0.95),
                               places=5)
if __name__ == '__main__':
    # Running this file as a script draws the plot. To run the
    # StudentScoreTesting unit tests instead, enable the call below.
    # unittest.main()
    main()