def save_img_in_url(url, fn='tempimg', maxdepth=1, currentdepth=1):
    """Fetch *url* and save the image(s) it leads to into file *fn*.

    If the resource is an image, it is written directly to *fn*.  If it
    is an HTML page, every <img> tag on it is followed recursively, up
    to *maxdepth* levels of recursion (*currentdepth* tracks the level).

    On any failure the offending URL is logged to '<fn>.saveerror' and
    GrabImgError is raised to the caller.
    """
    try:
        if currentdepth > maxdepth:
            raise TooDeepError(maxdepth)
        # get resource (fetched once and reused below)
        goturl = urlp.fetch_url(url)
        # check type and save
        if 'image' == goturl.info().getmaintype():
            # FIX: reuse the response already fetched above instead of
            # downloading the same URL a second time.
            with open(fn, 'wb') as f:
                f.write(goturl.read())
            dmsg('saved img:' + url)
        elif 'text/html' == goturl.info().gettype():
            imglist = bs(goturl)('img')
            if imglist:
                for img in imglist:
                    save_img_in_url(img['src'], fn, maxdepth,
                                    currentdepth + 1)
            else:
                dmsg("no img found")
                raise ImgNotFoundError(url, "no img found in the given url")
        else:
            raise OpenUrlError(url, 'unknown type' + goturl.info().gettype())
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.  Record the failing URL, then convert any
        # lower-level error into the single exception callers expect.
        logfn = fn + '.saveerror'
        with open(logfn, 'w') as l:
            l.write(url)
        raise GrabImgError
def main(url): step = 1 while url: print print "step", step print "page", url soup = bs(urlp.fetch_url(url)) try: prob_save_post(soup) except NoTimestampError: print "missing timestamp", url step = step + 1 if step > num_limit: break url = newer_page(soup) if url is None: break sleep(sleep_between_post)