# scrape.py -- forked from jkeesh/scpd-scraper
#!/usr/bin/env python
import os
import re
import subprocess
import sys
from getpass import *

from mechanize import Browser
from BeautifulSoup import BeautifulSoup
"""
This program downloads scpd videos for a given class in the order
that they happened as a wmv, then converts them to a mp4. Each time
the script is run, it will update to download all of the undownloaded
videos.
This script is modified from the one by Ben Newhouse (https://github.com/newhouseb).
Unfortunately, there are lots of dependencies to get it up and running:
1. Handbrake CLI, for converting to mp4: http://handbrake.fr/downloads2.php
2. BeautifulSoup for parsing: http://www.crummy.com/software/BeautifulSoup/
3. Mechanize for emulating a browser, http://wwwsearch.sourceforge.net/mechanize/
Usage: python scrape.py [Stanford ID] "Interactive Computer Graphics"
The way I use it is to keep a folder of videos and, once I have watched them, move them
into a subfolder called watched. So it also won't redownload files that are in a subfolder
called watched.
"""
def convertToMp4(wmv, mp4):
print "Converting ", mp4
os.system('HandBrakeCLI -i %s -o %s' % (wmv, mp4))
os.system('rm -f %s' % wmv)
def download(work):
# work[0] is url, work[1] is wmv, work[2] is mp4
if os.path.exists(work[2]) or os.path.exists("watched/"+work[2]):
print "Already downloaded", work[2]
return
print "Starting", work[1]
os.system("mimms -c %s %s" % (work[0], work[1]))
# convertToMp4(work[1], work[2])
print "Finished", work[1]
def downloadAll(username, courseName):
    """Log in to myvideosu.stanford.edu, open the course page, and download
    every lecture video stream for the course, oldest first.

    username: Stanford ID given on the command line (password is prompted
        interactively via getpass, so it never appears in shell history).
    courseName: exact link text of the course on the current-quarter page.
    """
    br = Browser()
    # Present a desktop-browser user agent and ignore robots.txt; the site
    # apparently refuses requests that look like scripts.
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    br.open("https://myvideosu.stanford.edu/oce/currentquarter.aspx")
    assert br.viewing_html()
    # Fill in the site's login form.
    br.select_form(name="login")
    br["username"] = username
    br["password"] = getpass()
    # Open the course page for the title you're looking for
    print "Logging in to myvideosu.stanford.edu..."
    response = br.submit()
    print "Logged in, going to course link."
    response = br.follow_link(text=courseName)
    #print response.read()
    # response = br.follow_link(text="HERE")
    # print response.read()
    # Build up a list of lectures
    print "Loading video links."
    links = []
    # Each "WMP" link's URL embeds the real lecture-page URL inside single
    # quotes; extract it with a regex.
    for link in br.links(text="WMP"):
        links.append(re.search(r"'(.*)'",link.url).group(1))
    link_file = open('links.txt', 'w')
    # So we download the oldest ones first.
    links.reverse()
    print "Found %d links, getting video streams."%(len(links))
    videos = []
    for link in links:
        # Fetch each lecture page and read the WMPlayer <object>'s `data`
        # attribute, which holds the stream URL.
        response = br.open(link)
        soup = BeautifulSoup(response.read())
        video = soup.find('object', id='WMPlayer')['data']
        # mimms needs an mms:// scheme rather than http://.
        video = re.sub("http","mms",video)
        video = video.replace(' ', '%20') # remove spaces, they break urls
        # Derive a local base name (e.g. "cs148_10") from the course/lecture
        # segment of the stream URL.
        output_name = re.search(r"[a-z]+[0-9]+[a-z]?/[0-9]+",video).group(0).replace("/","_") #+ ".wmv"
        output_wmv = output_name + ".wmv"
        link_file.write(video + '\n')
        print video
        output_mp4 = output_name + ".mp4"
        videos.append((video, output_wmv, output_mp4))
    link_file.close()
    print "Downloading %d video streams."%(len(videos))
    for video in videos:
        # download() skips anything already on disk or in watched/.
        download(video)
    print "Done!"
if __name__ == '__main__':
if (len(sys.argv) != 3):
print "Usage: ./scrape.py [Stanford ID] 'Interactive Computer Graphics'"
else:
username = sys.argv[1]
courseName = sys.argv[2]
downloadAll(username, courseName)