# Example no. 1
def save_img_in_url(url, fn='tempimg', maxdepth=1, currentdepth=1):
    """Save the image found at *url* into file *fn*.

    If *url* resolves to an image it is written to *fn*.  If it resolves
    to an HTML page, recurse into each <img> tag's ``src`` attribute,
    down to at most *maxdepth* levels.

    On any failure the offending url is written to ``fn + '.saveerror'``
    and ``GrabImgError`` is raised, so callers see one uniform error type.

    Raises:
        GrabImgError: wrapping any underlying failure (recursion too
            deep, no image found, unknown content type, fetch error, ...).
    """
    try:
        if currentdepth > maxdepth:
            raise TooDeepError( maxdepth )

        # Fetch the resource once and reuse it below; the original code
        # fetched the same url a second time just to read the body.
        goturl = urlp.fetch_url(url)
        info = goturl.info()

        # check type and save
        if 'image' == info.getmaintype():
            # 'with' closes the file even if write() raises; the explicit
            # close() the original had inside the with-block was redundant.
            with open(fn, 'wb') as f:
                f.write(goturl.read())
            dmsg('saved img:' + url)

        elif 'text/html' == info.gettype():
            imglist = bs(goturl)('img')
            if len(imglist) > 0:
                # NOTE(review): img['src'] may be a relative URL --
                # presumably urlp.fetch_url resolves that; confirm.
                for img in imglist:
                    save_img_in_url(img['src'], fn, maxdepth, currentdepth + 1)
            else:
                dmsg("no img found")
                raise ImgNotFoundError( url, "no img found in the given url" )

        else:
            raise OpenUrlError(url, 'unknown type' + info.gettype())

    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are no longer swallowed.  Record the failing url, then surface
        # the uniform error to callers as before.
        logfn = fn + '.saveerror'
        with open(logfn, 'w') as l:
            l.write(url)
        raise GrabImgError
# Example no. 2
def main(url):
    step = 1
    while url:
        print
        print "step", step
        print "page", url

        soup = bs(urlp.fetch_url(url))
        try:
            prob_save_post(soup)
        except NoTimestampError:
            print "missing timestamp", url

        step = step + 1
        if step > num_limit:
            break

        url = newer_page(soup)
        if url is None:
            break

        sleep(sleep_between_post)