def getWebpage(link='', dataDir='webpages', timeSleep=0, cookies='', reLoad=False, debug=False, read=True, referer='', info='', retry_num=10):
    """Fetch ``link``, serving it from the on-disk cache when possible.

    The cache file name is derived from ``cookies + link + info`` (through
    ``toFname``) and placed inside ``dataDir`` (through ``combinePath``).

    Args:
        link: URL to fetch; surrounding whitespace is stripped first.
        dataDir: directory holding cached pages; handed to ``createPath``
            (presumably creates it when missing -- confirm against helper).
        timeSleep: seconds to sleep before the first request and again
            after every failed request.
        cookies: value sent as the ``Cookie`` header; also part of the
            cache key.
        reLoad: when true, skip the cache lookup and go to the network.
        debug: when true, print which source the page is taken from.
        read: when true, read the response body; otherwise the response
            object itself is kept.
        referer: value sent as the ``Referer`` header.
        info: extra text mixed into the cache key.
        retry_num: maximum number of request attempts.

    Returns:
        ``None`` when ``link`` is empty, or the cached file contents on a
        cache hit.
    """
    import sys  # local import: only needed for the same-line status output

    link = link.strip()
    if link == '':
        return
    createPath(dataDir)
    fname = combinePath(dataDir, toFname(cookies + link + info))

    if not reLoad:
        try:
            # `with` guarantees the handle is closed even if read() fails.
            with open(fname, 'r') as f:
                page = f.read()
        except Exception:
            # The cache is best effort: any failure counts as a miss.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            pass
        else:
            if debug:
                print('read from cached file')
            return page

    if debug:
        print('reading from web')
    time.sleep(timeSleep)

    # The opener and its headers do not change between attempts.
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent', 'safari/536.25'),
        ('Cookie', cookies),
        ('Referer', referer),
    ]

    for _ in range(retry_num):
        try:
            page = opener.open(link)
        except (urllib2.HTTPError, urllib2.URLError) as e:
            # URLError carries no status code; HTTPError does.
            if hasattr(e, 'code'):
                # Keep the codes of successive failures on one line.
                sys.stdout.write('%s ' % (e.code,))
            page = ''
            time.sleep(timeSleep)
            continue
        if not read:
            break
        try:
            page = page.read()
        except Exception:
            # Report the real retry limit instead of a hard-coded 10.
            print('error reading page, try again (until trying time reach %d)' % retry_num)
            print(link)
            continue
        break
    # NOTE(review): the visible definition ends here, so the fetched `page`
    # is neither written to `fname` nor returned. Presumably the remainder of
    # the function was lost -- confirm against the original source.
def getWebpage(link='', dataDir='webpages', timeSleep=0, cookies='', reLoad=False, debug=False, read=True, referer='', info='', retry_num=10):
    """Fetch ``link``, serving it from the on-disk cache when possible.

    The cache file name is derived from ``cookies + link + info`` (through
    ``toFname``) and placed inside ``dataDir`` (through ``combinePath``).

    Args:
        link: URL to fetch; surrounding whitespace is stripped first.
        dataDir: directory holding cached pages; handed to ``createPath``
            (presumably creates it when missing -- confirm against helper).
        timeSleep: seconds to sleep before the first request and again
            after every failed request.
        cookies: value sent as the ``Cookie`` header; also part of the
            cache key.
        reLoad: when true, skip the cache lookup and go to the network.
        debug: when true, print which source the page is taken from.
        read: when true, read the response body; otherwise the response
            object itself is kept.
        referer: value sent as the ``Referer`` header.
        info: extra text mixed into the cache key.
        retry_num: maximum number of request attempts.

    Returns:
        ``None`` when ``link`` is empty, or the cached file contents on a
        cache hit.
    """
    import sys  # local import: only needed for the same-line status output

    link = link.strip()
    if link == '':
        return
    createPath(dataDir)
    fname = combinePath(dataDir, toFname(cookies + link + info))

    if not reLoad:
        try:
            # `with` guarantees the handle is closed even if read() fails.
            with open(fname, 'r') as f:
                page = f.read()
        except Exception:
            # The cache is best effort: any failure counts as a miss.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            pass
        else:
            if debug:
                print('read from cached file')
            return page

    if debug:
        print('reading from web')
    time.sleep(timeSleep)

    # The opener and its headers do not change between attempts.
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent', 'safari/536.25'),
        ('Cookie', cookies),
        ('Referer', referer),
    ]

    for _ in range(retry_num):
        try:
            page = opener.open(link)
        except (urllib2.HTTPError, urllib2.URLError) as e:
            # URLError carries no status code; HTTPError does.
            if hasattr(e, 'code'):
                # Keep the codes of successive failures on one line.
                sys.stdout.write('%s ' % (e.code,))
            page = ''
            time.sleep(timeSleep)
            continue
        if not read:
            break
        try:
            page = page.read()
        except Exception:
            # Report the real retry limit instead of a hard-coded 10.
            print('error reading page, try again (until trying time reach %d)' % retry_num)
            print(link)
            continue
        break
    # NOTE(review): the visible definition ends here, so the fetched `page`
    # is neither written to `fname` nor returned. Presumably the remainder of
    # the function was lost -- confirm against the original source.
for photo in photos: img=photo.find('img') if not img: continue if not img.has_key('alt'): continue name=img['alt'] if img.has_key('data-src'): url=img['data-src'] else: url=img['src'] url=url.replace('head','original') info.append((url,name)) return (album_name,info) def fetchAlbum((album_name,info),caption=False,type=None): createPath(album_name) i=0 #print len(info) for name,url in info: i+=1 ind=name.find('(') if ind>1: name=name[:ind] cap_content=name name=name.replace('/','') name=' '.join(name.split()) name=name.replace(' ','_') if name!='': fname=str(i)+'_'+name else: fname=str(i) name=combinePath(album_name,fname)+'.jpg'