fetch.py
#!/usr/bin/env python
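"""Fetch the week's top /r/MapPorn posts, cache any imgur-hosted images
under static/images/, scale them down for display, and purge cached
images that have dropped out of the top list."""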
from BeautifulSoup import BeautifulSoup as BS
from urllib2 import Request, urlopen as open_url
from urlparse import urlsplit as split_url
import re
from os import listdir, remove
from os.path import splitext
import Image
SOURCE_URL = "http://www.reddit.com/r/MapPorn/top/?sort=top&t=week"
HEADERS = {'User-Agent' : 'Mozilla/5 (Linux i386) Gecko'}
IMAGES = 'static/images/'
def fetch_new_images():
    html = _fetch()
    posts = html.findAll('a', 'title')
    new_images = []
    for post in posts:
        url = post.attrMap['href']
        url_parts = split_url(url)
        if re.match(r'^.+\.imgur\.com', url_parts.netloc):  # we only want to pull images from imgur
            if url[-4:] not in ['.jpg', '.png']:  # make sure we're pulling the image, not the HTML page
                continue
            new_images.append({'title': post.text, 'href': url, 'filename': _get_filename_from_url(url)})
    _cache_images(new_images)
    return new_images
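# Example of what fetch_new_images() returns (the values shown are illustrative):
#   [{'title': u'Some map title',
#     'href': 'http://i.imgur.com/abc123.jpg',
#     'filename': 'abc123.jpg'}]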
def _fetch():
    request = Request(SOURCE_URL, headers=HEADERS)
    response = open_url(request)
    page_content = response.read()
    soup = BS(page_content)
    return soup
def _cache_images(new_images):
    for image in new_images:
        new_filename = _get_filename_from_url(image['href'])
        request = Request(image['href'], headers=HEADERS)
        response = open_url(request)
        extracted_jpeg_data = response.read()
        f = open(IMAGES + new_filename, 'wb')  # FIXME: catch exceptions?
        f.write(extracted_jpeg_data)
        f.close()
        _scale_image(new_filename)
    _purge_everything_but(new_images)
def _purge_everything_but(new_images):
    files_to_keep = set([_get_filename_from_url(image['href']) for image in new_images])
    existing_files = set(listdir(IMAGES))
    # remove cached files that are no longer among the new images
    files_to_remove = existing_files.difference(files_to_keep)
    for file_to_remove in files_to_remove:
        remove(IMAGES + file_to_remove)
def _get_filename_from_url(url):
    return split_url(url).path.split('/')[-1]
def _scale_image(image):
    print "Scaling image", image
    size = 900, 500
    try:
        im = Image.open(IMAGES + image)
        im.thumbnail(size, Image.ANTIALIAS)
        im.save(IMAGES + image)
    except IOError as e:
        print "Can't scale down image", image, "because:", e