/
process.py
93 lines (78 loc) · 3.12 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import petl
import re
import json
def _load_year(year):
    """Load every JSON scrape under ./data/<year>/ into one petl table.

    Each file is concatenated with ``cat``, then rows sharing an
    ``updated_at`` timestamp are collapsed with ``distinct`` — repeated
    scrapes of an unchanged playlist produce duplicate snapshots.

    year: directory name as a string, e.g. '2018'.
    Returns the deduplicated petl table.
    """
    directory = './data/' + year + '/'
    table = petl.util.base.empty()
    for filename in os.listdir(directory):
        table = table.cat(petl.fromjson(directory + filename))
    return table.distinct('updated_at')

# Was duplicated load/dedup code per year; printing the post-dedup row
# counts (2018 first, then 2017) exactly as before.
data_2018 = _load_year('2018')
print(data_2018.nrows())
data_2017 = _load_year('2017')
print(data_2017.nrows())
# Canonicalize song titles that were renamed between scrapes/releases,
# so the same recording matches across both years.
name_changes = {
    'Have a Holly Jolly Christmas': 'A Holly Jolly Christmas',
    'Merry Christmas Darling (Remix)': 'Merry Christmas Darling',
    'The Chipmunk Song (feat. Alvin) [Christmas Don\'t Be Late]': 'The Chipmunk Song',
    'Walkin In A Winter Wonderland': 'Winter Wonderland',
    'Santa Claus Is Coming to Town (Intro)': 'Santa Claus Is Coming to Town',
    'Santa Claus Is Comin\' to Town': 'Santa Claus Is Coming to Town',
    'Have Yourself Merry Little Christmas': 'Have Yourself A Merry Little Christmas'
}
# petl's convert() with a dict maps listed values and leaves the rest alone.
data_2017, data_2018 = (
    table.convert('song_title', name_changes)
    for table in (data_2017, data_2018)
)
# Create normalized song identity columns and per-year aggregations.
def _rank_songs(data, year):
    """Build the per-song count/rank table for one year of play data.

    Steps (identical to the previous hand-duplicated pipelines):
      1. Strip a trailing "(feat. ...)" / "[feat. ...]" suffix from titles.
      2. Derive ``song_title_id``: the title with all non-word chars
         removed, lowercased — a normalized join key.
      3. Aggregate per (song, artist) to count plays per rendition.
      4. Re-aggregate per song: total count, a representative title, and
         the list of (artist, count) pairs.
      5. Sort by count descending and number the rows as ``rank_<year>``.

    data: petl table with ``song_title`` and ``artist`` fields.
    year: year as a string ('2017'/'2018'); suffixes the output fields.
    Returns the ranked petl table.
    """
    count_field = 'count_' + year
    title_field = 'song_title_' + year
    artist_count_field = 'artist_count_' + year
    artists_field = 'artists_' + year
    return (data
        .convert('song_title', lambda t: re.sub(r'[\(\[][Ff]eat.*$', '', t))
        .addfield('song_title_id', lambda rec: re.sub(r'\W', '', rec.song_title).lower())
        .aggregate(('song_title_id', 'artist'), {
            artist_count_field: len,
            title_field: lambda r: r[0].song_title,
        })
        .aggregate('song_title_id', {
            count_field: (artist_count_field, sum),
            # Bind the field name as a default arg so the closure does not
            # capture the loop-free but reassignable local late.
            title_field: lambda r, f=title_field: getattr(r[0], f),
            artists_field: (('artist', artist_count_field,), list)
        })
        .sort(count_field, reverse=True)
        .addrownumbers(start=1, field='rank_' + year)
    )

# Was two copy-pasted pipelines differing only in the year suffix.
song_2018 = _rank_songs(data_2018, '2018')
song_2017 = _rank_songs(data_2017, '2017')
def _popular_enough(rec):
    """Keep songs that appeared at all in 2018, or more than twice in 2017.

    Missing counts from the outer join come through as None; coerce to 0.
    """
    plays_2018 = rec['count_2018'] or 0
    plays_2017 = rec['count_2017'] or 0
    return plays_2018 >= 1 or plays_2017 > 2

# Join the two years on the normalized title id, filter out noise, then
# collapse the per-year titles into one display column.
totals = (
    petl.outerjoin(song_2018, song_2017, 'song_title_id')
    .select(_popular_enough)
    .addfield('song_title', lambda rec: rec.song_title_2018 or rec.song_title_2017)
    .cutout('song_title_2018', 'song_title_2017', 'song_title_id')
)
def rank_change(rec):
    """Signed chart movement from 2017 to 2018 (positive = climbed).

    Songs absent from one year get sentinel scores so they sort to the
    extremes: +1000 for a new 2018 entry, -1000 for a 2017 song that
    dropped off.
    """
    rank_2017, rank_2018 = rec['rank_2017'], rec['rank_2018']
    if rank_2017 is None:
        return 1000
    if rank_2018 is None:
        return -1000
    return rank_2017 - rank_2018
# Score every song's year-over-year movement and sort biggest climbers first.
winners_losers = (
    totals
    .addfield('rank_change', rank_change)
    .sort('rank_change', reverse=True)
)
# Emit one JSON object per line with a trailing comma, ready to paste
# into a JSON array literal by hand.
for entry in winners_losers.dicts():
    print(json.dumps(entry) + ',')
# top2018 = totals.sort('count_2018', reverse=True)
# for entry in totals.dicts().islice(0,20):
#     print(str(entry) + ',')