Пример #1
0
def spider_link():
    """Take one URL off the link queue, crawl it and store the page gzipped.

    The URL is read from the redis queue named by the ``link_queue`` config
    entry and fetched through ``crawler_url.chrome_crawler``. The resulting
    page is UTF-8 encoded and written with ``tools.gzip_file`` under the name
    ``link_<md5 of url>.html``. On success the file name is pushed onto the
    ``html_queue`` queue; if the write reports failure, or any exception is
    raised after the URL was taken, the URL is pushed back onto
    ``link_queue`` so it can be retried.

    Returns None. Exceptions are logged and not propagated.
    """
    # Bound before the try so the except handler can always reference it,
    # even when reading the queue itself is what failed.
    url = None
    try:
        url = rdb.consumers(cf.get('redis', 'link_queue'))
        if url is None:
            # Nothing to process (presumably an empty queue -- confirm
            # against rdb.consumers).
            return

        logging.info('crawler url is: %s', url)
        page = crawler_url.chrome_crawler(url, '', '')
        if page is None:
            # NOTE(review): the URL is dropped here, not requeued; this
            # mirrors the original behaviour.
            return

        page = page.encode('utf-8')
        filename = 'link_' + tools.get_md5(url) + '.html'
        logging.info(filename)
        if tools.gzip_file(filename, page):
            rdb.producers(cf.get('redis', 'html_queue'), filename)
        else:
            rdb.producers(cf.get('redis', 'link_queue'), url)
    except Exception as e:
        # Log with traceback; a plain info(e) loses where the failure was.
        logging.exception(e)
        # Only requeue when a URL was actually taken off the queue.
        if url is not None:
            rdb.producers(cf.get('redis', 'link_queue'), url)
Пример #2
0
def spider_down():
    """Take one image URL off the download queue and save it to disk.

    The URL is read from the redis queue named by the ``down_queue`` config
    entry and fetched with a browser-like User-agent header and a 5 second
    timeout. The body is written with ``tools.save_to_file`` under the name
    ``<md5 of url>.jpg``. If the save reports failure, or any exception is
    raised after the URL was taken, the URL is pushed back onto
    ``down_queue`` so it can be retried.

    Returns None. Exceptions are logged and not propagated.
    """
    # Bound before the try so the except handler can always reference it,
    # even when reading the queue itself is what failed.
    get_url = None
    try:
        get_url = rdb.consumers(cf.get('redis', 'down_queue'))
        if get_url is None:
            # Nothing to download (presumably an empty queue -- confirm
            # against rdb.consumers). Without this guard the request below
            # fails and None would be pushed back onto the queue.
            return

        headers = {
            'User-agent':
            'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'
        }
        req = urllib2.Request(url=get_url, headers=headers)
        response = urllib2.urlopen(req, timeout=5)
        try:
            binary_data = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        filename = tools.get_md5(get_url) + '.jpg'
        if tools.save_to_file(filename, binary_data):
            logging.info('img download ok !')
        else:
            rdb.producers(cf.get('redis', 'down_queue'), get_url)
    except Exception as e:
        # Log with traceback; a plain info(e) loses where the failure was.
        logging.exception(e)
        # Only requeue when a URL was actually taken off the queue.
        if get_url is not None:
            rdb.producers(cf.get('redis', 'down_queue'), get_url)
Пример #3
0
def spider_down():
    """Take one download job off the queue and save the image it points to.

    A job is a JSON string read from the redis queue named by the
    ``down_queue`` config entry. Its ``href`` key is the resource to fetch
    and its ``url`` key is sent as the ``Referer`` header. The request uses a
    browser-like User-agent and a 5 second timeout. The body is written with
    ``tools.save_to_file`` under the name ``<md5 of href>.jpg``. If the save
    reports failure, or any exception is raised after the job was taken, the
    original job string is pushed back onto ``down_queue`` so it can be
    retried.

    Returns None. Exceptions are logged and not propagated.
    """
    # Bound before the try so the except handler can always reference it,
    # even when reading the queue itself is what failed.
    down = None
    try:
        down = rdb.consumers(cf.get('redis', 'down_queue'))
        if down is None:
            # Nothing to download (presumably an empty queue -- confirm
            # against rdb.consumers). Without this guard json.loads fails
            # and None would be pushed back onto the queue.
            return

        # NOTE(review): a malformed job (bad JSON or missing keys) raises
        # here and is requeued unchanged, as in the original.
        job = json.loads(down)
        url = job['url']
        href = job['href']
        headers = {
            'User-agent':
            'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0',
            'Referer': url
        }
        req = urllib2.Request(url=href, headers=headers)
        response = urllib2.urlopen(req, timeout=5)
        try:
            binary_data = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        filename = tools.get_md5(href) + '.jpg'
        if tools.save_to_file(filename, binary_data):
            logging.info('img download ok !')
        else:
            rdb.producers(cf.get('redis', 'down_queue'), down)
    except Exception as e:
        # Log with traceback; a plain info(e) loses where the failure was.
        logging.exception(e)
        # Only requeue when a job was actually taken off the queue.
        if down is not None:
            rdb.producers(cf.get('redis', 'down_queue'), down)