# scrape_yescom.py
# Ian Young
# December 2008
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##
## Scrape Yes.com to retrieve recently played songs.
## Should be run regularly (every 15 minutes works well for me).
##
import urllib
from pysqlite2 import dbapi2
import re
import datetime
import sys
import os
import simplejson
from pytz import timezone
import random
import time
# Debug switch. NOTE(review): no code visible in this file reads it.
debug = False
def create_tables(db):
    """Initialize the database. Call once when first starting collection.

    Drops and recreates the ``songs`` and ``last_parsed`` tables in the
    SQLite database at *db*, so any rows already stored there are lost.
    Do not call this unless starting with a blank slate.

    db -- path of the SQLite database file, handed to dbapi2.connect().
    """
    connection = dbapi2.connect(db)
    try:
        c = connection.cursor()
        try:
            c.execute('drop table if exists songs')
            c.execute('''create table songs
                (id integer primary key,
                time_played time,
                date_played date,
                artist text,
                title text)''')

            # The most important thing this table does is store the "next"
            # value that we use in requests
            c.execute('drop table if exists last_parsed')
            c.execute('''create table last_parsed
                (id integer primary key,
                next_id int,
                date date,
                count int)''')

            # Put a dummy value in here.
            # This will start us off with a query that doesn't get any
            # entries but gets the most recent "next" value. This entry may
            # be deleted once a running database exists.
            # The values are bound as parameters: a double-quoted "..." in
            # SQL is an identifier, and SQLite only treats it as a string
            # through a legacy fallback that can be compiled out.
            c.execute('insert into last_parsed (next_id, date, count)'
                      ' values (?, ?, ?)', (1, '1980-01-01', 0))
            connection.commit()
        finally:
            c.close()
    finally:
        # Closing the cursor alone leaves the database handle open.
        connection.close()
def append_songs(db, url):
    """Fetch the newest entries for one station and store them in *db*.

    Reads the latest ``next_id`` from ``last_parsed`` (falling back to 1
    when the table has no rows), requests ``<url>/ev/<next_id>`` with the
    terms-of-service query string, and inserts every song yielded by
    recent_songs() into ``songs``. It then records the response's ``next``
    value, today's date and the number of songs inserted as a new
    ``last_parsed`` row. All inserts are committed together at the end; if
    anything raises before that, nothing is committed.

    db  -- path of the station's SQLite database file.
    url -- relay base URL for the station.
    """
    connection = dbapi2.connect(db)
    try:
        c = connection.cursor()
        try:
            thisdate = datetime.date.today()

            c.execute('select next_id from last_parsed order by id desc limit 1')
            last_row = c.fetchone()
            if last_row:
                next_at = last_row[0]
            else:
                next_at = 1

            request_url = url + '/ev/' + str(next_at)
            request_url = request_url + '?tos=http://www.iangreenleaf.com/TermsOfService'
            page = urllib.urlopen(request_url)
            try:
                result = simplejson.load(page)
            finally:
                # Release the HTTP response even if the JSON is malformed.
                page.close()

            count = 0
            for played_at, artist, title in recent_songs(result):
                c.execute('insert into songs'
                          ' (date_played, time_played, artist, title)'
                          ' values (?, ?, ?, ?)',
                          (played_at.date().isoformat(),
                           played_at.time().isoformat(),
                           artist, title))
                count = count + 1

            # Remember where the next request should resume from.
            next_id = result['next']
            c.execute('insert into last_parsed (next_id, date, count)'
                      ' values (?, ?, ?)',
                      (next_id, thisdate.isoformat(), count))
            connection.commit()
        finally:
            c.close()
    finally:
        # Closing the cursor alone leaves the database handle open.
        connection.close()
def recent_songs(result):
    """Yield a (datetime, artist, title) tuple for each song in *result*.

    result -- decoded response; its ``'entries'`` value is iterated.

    Only entries whose ``'type'`` is ``'song'`` and that carry ``'title'``,
    ``'by'`` and ``'at'`` keys are used; anything else is skipped. Artist
    and title are passed through normalize(). ``'at'`` is divided by 1000
    before being handed to fromtimestamp(), i.e. it is treated as epoch
    milliseconds, and the resulting US/Central datetime is truncated to
    the minute.
    """
    # The zone object does not depend on the row, so build it once.
    central = timezone('US/Central')

    for row in result['entries']:
        # .get() and the 'at' check make incomplete entries skip cleanly
        # instead of aborting the whole batch with a KeyError.
        if row.get('type') != 'song' \
                or 'title' not in row \
                or 'by' not in row \
                or 'at' not in row:
            continue

        artist = normalize(row['by'])
        song_title = normalize(row['title'])

        dt_parsed = datetime.datetime.fromtimestamp(row['at'] / 1000, central)
        dt_parsed = dt_parsed.replace(microsecond=0, second=0)
        yield (dt_parsed, artist, song_title)
def normalize(string):
    """Return *string* in a canonical form suitable for comparing names.

    The value is converted to unicode, every character that is not a word
    character or whitespace is removed, runs of whitespace are collapsed
    to a single space, and the result is lowercased and stripped.

    NOTE(review): the patterns are applied without re.UNICODE; on the
    Python 2 interpreter this file targets that presumably makes ``\\w``
    ASCII-only, so accented letters would be dropped too -- confirm that
    this is intended before changing it.
    """
    string = unicode(string)
    # Filter anything out of the ordinary, including punctuation.
    # Raw strings keep the backslashes for the regex engine instead of
    # relying on Python passing unknown escapes through.
    string = re.sub(r'[^\w\s\d]', '', string)
    # Compress whitespace
    string = re.sub(r'\s+', ' ', string)
    # Lowercase everything and trim whitespace
    return string.lower().strip()
# Relay base URL for each station, keyed by the short name that is also
# used for the station's database file (<name>.sqlite next to this script).
stations = {'jackfm': 'http://r2b.yes.com/relay/fead789258e8b63acb4c17cfb5ded00d1fa2a3e9',
            'wlte': 'http://r2a.yes.com/relay/094a1780a307bd1610ba90fa4db7ab8445c1ecd7',
            'kqrs': 'http://r2a.yes.com/relay/863360c4ef7d198c8f12ef69a1f18af96300a23f',
            'love105': 'http://r2b.yes.com/relay/1531a73d169ff69181a601070783a7bf0a8ce991',
            '93x': 'http://r2b.yes.com/relay/741844eeb0a159f60d2649c44af4c2f9fc7b86e9',
            'b96': 'http://r2a.yes.com/relay/e4e3a18874dbd40fc0005626eaa34784fd70cb64',
            'ks95': 'http://r2b.yes.com/relay/9b252e6e30593bf881ba28937bb06a93caf238bf'}

remaining = len(stations)
for key, url in stations.items():
    db = os.path.join(os.path.abspath(os.path.dirname(__file__)), key + '.sqlite')
    # Uncomment for the very first run only: it drops and recreates the tables.
    #create_tables(db)
    append_songs(db, url)
    remaining -= 1
    if remaining:
        # Random 0-15 second pause between stations; pointless after the
        # last one, so the script exits as soon as it is done.
        wait_time = random.randrange(0, 16, 1)
        time.sleep(wait_time)