Example #1
File: sitemap.py  Project: sjl421/pyhome
class LagouSitemap(ThreadPoolCrawler):
    db = get_db('htmldb')
    col = getattr(db, 'lagou_url')  # collection

    def get(self, url, *args, **kwargs):
        headers = {
            'User-Agent':
            'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        }
        return super(LagouSitemap, self).get(url,
                                             headers=headers,
                                             *args,
                                             **kwargs)

    def handle_response(self, url, response):
        """处理http响应,对于200响应码直接处理html页面,
        否则按照需求处理不同响应码"""
        self.logger.info('url:%s', url)
        if response.status_code == 200:
            self.handle_html(url, response.text)

    def init_urls(self):
        for i in range(1, 541):  # max is 540
            url = 'http://www.lagou.com/upload/sitemap/xml/lagou_sitemap_%d.xml' % i
            self.urls.append(url)

    def handle_html(self, url, html):
        all_loc = extract_all('<loc>', '</loc>', html)
        self.logger.info('%s', pformat(all_loc))
        self.col.insert_many([{'url': loc} for loc in all_loc])
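
For reference, extract_all is a project helper that is not shown here; a minimal sketch of what it presumably does (collect every substring between a start and an end marker, which is how the <loc> entries above are pulled out) could look like this:

def extract_all(start, end, html):
    """Hypothetical sketch: return every substring of html found between start and end."""
    results = []
    pos = 0
    while True:
        begin = html.find(start, pos)
        if begin == -1:
            break
        begin += len(start)
        stop = html.find(end, begin)
        if stop == -1:
            break
        results.append(html[begin:stop])
        pos = stop + len(end)
    return results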
Example #2
    def __init__(self):
        tornado.web.Application.__init__(self, url_patterns, **settings)
        self._redis = _db.redis_client
        self._motor = get_db(CONFIG.MONGO.DATABASE, client='motor')
        connect(CONFIG.MONGO.DATABASE, host=CONFIG.MONGO.HOST,
                port=CONFIG.MONGO.PORT,
                io_loop=tornado.ioloop.IOLoop.current())    # motorengine
Example #3
class CheckXiciCralwer(ThreadPoolCrawler):
    """CheckXiciCralwer 用来测试代理的有效性,及时剔除没用的代理"""

    db = get_db('htmldb')
    col = getattr(db, 'xici_proxy')    # collection
    timeout = (10, 10)    # connect timeout and read timeout
    concurrency = 100

    def init_urls(self):
        """init_urls get all ip proxy from monggo"""
        url = 'http://www.lagou.com/'
        for ip_info in self.col.find(no_cursor_timeout=True):
            ip, port = ip_info['ip'], ip_info['port']
            if ip and port:
                self.urls.append((url, ip, port))    # tuple

    def get(self, url, proxies, timeout):
        headers = {
            'User-Agent': random_search_engine_ua()
        }
        return requests.get(
            url, proxies=proxies, timeout=timeout, headers=headers
        )

    def run_async(self):
        self.logger.info('before check %d proxies', self.col.count())

        for url_list in chunks(self.urls, 100):    # handle 100 URLs at a time
            pprint(url_list)
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) as executor:
                future_to_url = {
                    executor.submit(
                        self.get, url, proxies=get_proxy_dict(ip, int(port)), timeout=self.timeout
                    ): (url, ip, port)
                    for (url, ip, port) in url_list
                }
                for future in concurrent.futures.as_completed(future_to_url):
                    url, ip, port = future_to_url[future]
                    try:
                        response = future.result()
                        if response.status_code != 200:
                            self.logger.info(
                                'status_code %d ,delete proxy %s:%s',
                                response.status_code, ip, port
                            )
                            self.col.delete_one({'ip': ip, 'port': port})
                    except Exception as e:  # a previous custom get() swallowed exceptions instead of raising them
                        self.logger.info('delete proxy %s:%s', ip, port)
                        self.col.delete_one({'ip': ip, 'port': port})
                    else:
                        self.handle_response(url, response)

        self.logger.info('after check %d proxies', self.col.count())

    def handle_response(self, url, response):
        """handle_response 验证代理的合法性。通过发送简单请求检测是否超时"""
        if response:
            self.logger.info('url: %s %s', url, response.status_code)
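
The chunks and get_proxy_dict helpers used in run_async above come from elsewhere in the project; minimal sketches of what they presumably do (names and signatures taken from the calls above, bodies are assumptions):

def chunks(seq, size):
    """Hypothetical sketch: yield successive slices of seq holding at most size items."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]


def get_proxy_dict(ip, port):
    """Hypothetical sketch: build the proxies mapping that requests expects."""
    proxy = 'http://%s:%d' % (ip, port)
    return {'http': proxy, 'https': proxy}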
Example #4
File: proxy.py  Project: sjl421/pyhome
class XiciCrawler(ThreadPoolCrawler):

    db = get_db('htmldb')
    col = getattr(db, 'xici_proxy')  # collection
    sleep = 10

    def init_urls(self):
        urls = [
            'http://www.xicidaili.com/wn/%d',
            'http://www.xicidaili.com/wt/%d',
            'http://www.xicidaili.com/nn/%d',
            'http://www.xicidaili.com/nt/%d',
        ]
        for url in urls:
            for i in range(1, 10):
                self.urls.append(url % i)

    def bulk_update_to_mongo(self, ip_dict_list):
        """bulk_update_to_mongo

        :param ip_dict_list:
        OrderedDict([('country', 'Cn'), ('ip', u'115.46.80.120'), ('port', u'8123'), ('address', u'\u5e7f\u897f\u5357\u5b81'), ('anonymous', u'\u9ad8\u533f'), ('type', u'HTTP'), ('speed', u'3.687\u79d2'), ('connect_time', u'0.737\u79d2'), ('live_time', u'1\u5206\u949f'), ('verify_time', u'16-07-26 10:54')])
        """
        bulk = self.col.initialize_ordered_bulk_op()
        for ip_info_dict in ip_dict_list:
            self.logger.info('%s', ip_info_dict['ip'])
            query_dict = {
                'ip': ip_info_dict['ip'],
                'port': ip_info_dict['port'],
            }
            update_dict = {'$set': ip_info_dict}
            bulk.find(query_dict).upsert().update(update_dict)

        bulk.execute()
        self.logger.info('count %d', self.col.count())

    def handle_response(self, url, response):
        """handle_response 把代理ip的信息存储到mongodb中

        :param url:
        :param response: requests.models.Response
        """
        self.logger.info('handle url: %s', url)
        if not response:
            return
        if response.status_code == 200:
            html = response.text
            html_parser = XiciHtmlParser(url, html)
            ip_info_dict_yield = html_parser.parse()
            self.bulk_update_to_mongo(ip_info_dict_yield)
        elif response.status_code == 503:
            change_ip()
            self.urls.append(url)  # retry
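
Note that initialize_ordered_bulk_op used above was deprecated in later PyMongo releases; a sketch of the same upsert-by-(ip, port) logic written against the newer bulk_write API (assuming PyMongo 3.x; not part of the original project):

from pymongo import UpdateOne

def bulk_upsert(col, ip_dict_list):
    """Sketch: equivalent of bulk_update_to_mongo using bulk_write."""
    ops = [
        UpdateOne({'ip': d['ip'], 'port': d['port']}, {'$set': d}, upsert=True)
        for d in ip_dict_list
    ]
    if ops:
        col.bulk_write(ops, ordered=True)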
Example #5
File: proxy.py  Project: sjl421/pyhome
class KuaidailiCrawler(ThreadPoolCrawler):
    """http://www.kuaidaili.com/"""
    db = get_db('htmldb')
    col = getattr(db, 'kuaidaili_proxy')  # collection
    sleep = 10

    def init_urls(self):
        _range = 1, 10
        for i in range(_range[0], _range[1] + 1):
            url_list = [
                url % i for url in [
                    'http://www.kuaidaili.com/free/inha/%d/',
                    'http://www.kuaidaili.com/free/intr/%d/',
                    'http://www.kuaidaili.com/free/outha/%d/',
                ]
            ]
            self.urls.extend(url_list)

    def bulk_update_to_mongo(self, ip_dict_list):
        bulk = self.col.initialize_ordered_bulk_op()

        for ip_info_dict in ip_dict_list:
            self.logger.info('%s:%s', ip_info_dict['ip'], ip_info_dict['port'])
            query_dict = {
                'ip': ip_info_dict['ip'],
                'port': ip_info_dict['port'],
            }
            update_dict = {'$set': ip_info_dict}
            bulk.find(query_dict).upsert().update(update_dict)

        bulk.execute()
        self.logger.info('count %d', self.col.count())

    def handle_response(self, url, response):
        self.logger.info('handle url: %s', url)
        if not response:
            return
        if response.status_code == 200:
            html = response.text
            html_parser = KuaidailiHtmlParser(url, html)
            ip_info_dict_yield = html_parser.parse()
            self.bulk_update_to_mongo(ip_info_dict_yield)
        elif response.status_code == 503:
            change_ip()
            self.urls.append(url)  # retry
Example #6
class ParseJob(object):
    """用来处理抓下来的html页面,把需要的数据从html中提取出来单独存储"""

    db = get_db('htmldb')

    def __init__(self):
        self.from_col = getattr(self.db, 'lagou_html')
        self.to_col = getattr(self.db, 'lagou_job')
        self.key = self.__class__.__name__
        self.last_id = int(r.get(self.key) or 0)

    def set_id(self, last_id=0):
        r.set(self.key, last_id)

    def run_job(self):
        """lagou job页面的信息任务"""
        for doc_dict in self.from_col.find({
                '_id': {
                    '$gte': self.last_id
                }
        }).sort('_id', 1):

            if 'job' in doc_dict['url']:  # job url
                doc = ObjectDict(doc_dict)
                assert doc.url and doc.html
                if LagouCrawler.is_deleted_html(doc.html, False):
                    self.from_col.delete_one({'url': doc.url})
                    continue
                job_parser = LagouHtmlParser(doc.url, doc.html)

                data_dict = job_parser.parse_job()
                if data_dict is None:
                    self.from_col.delete_one({'url': doc.url})
                    continue

                self.logger.info('handle url: %s %s:%s', doc.url,
                                 data_dict['source'], data_dict['job'])
                if not DEBUG:
                    self.to_col.update(
                        {'_id': doc._id},
                        {'$set': data_dict},
                        upsert=True,
                    )
                self.set_id(doc._id)
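
The ObjectDict wrapper used in run_job is tornado.util.ObjectDict, a dict subclass that exposes keys as attributes; a quick illustration with made-up data:

from tornado.util import ObjectDict

doc = ObjectDict({'url': 'http://www.lagou.com/jobs/1.html', 'html': '<html></html>'})
print(doc.url)   # attribute-style access maps to the 'url' key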
Example #7
File: ininin.py  Project: sjl421/pyhome
    url = re.sub(r'product_id=(\d+)&', 'product_id=%s&' % str(product_id), url)
    r = requests.get(url, headers=header_dict, data=data)    # keyword params
    # BEHBAAHHHAEAICGV({"result":0,"msg":"","data":{"result":0,"msg":"查询成功","productId":200021,"categoryId":39,"categoryName":"经典","productName":"“新”名片","productImg":"http://cloud.ininin.com/1453727435050.jpg","title":"“新”名片_铜版纸名片设计_铜版纸名片制作_铜版纸名片报价_云印","keywords":"铜版纸名片设计,铜版纸名片制作,铜版纸名片报价","description":"高档300克铜版纸,具有手感厚重,笔直挺括,质地密实、高白度、设计表现强特点。报价:最便宜3.5元至最贵59元/盒(100张),多款铜版纸名片,5种可选铜版纸名片处理工艺。","pImages":"http://cloud.ininin.com/1453727455067.jpg,http://cloud.ininin.com/1453727457303.jpg,http://cloud.ininin.com/1453727459607.jpg,http://cloud.ininin.com/1453727472730.jpg,http://cloud.ininin.com/1453727468168.jpg","priceDesc":"8元/盒起","simpleDesc":"“新”名片【铜版纸】——案头常备的优质名片,满99包邮!","productDesc":"[{\"title\":\"下单流程\",\"content\":\"\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1453727509640.jpg\\\"/\u003e\u003c/p\u003e\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1453727519881.jpg\\\"/\u003e\u003c/p\u003e\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1457590273025.jpg\\\"/\u003e\u003c/p\u003e\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1470700220636.png\\\" style\u003d\\\"max-width:100%;\\\"/\u003e\u003c/p\u003e\u003cp\u003e\u003cbr/\u003e\u003c/p\u003e\u003cp\u003e\u003cbr/\u003e\u003c/p\u003e\"},{\"title\":\"产品介绍\",\"content\":\"\u003cdiv style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1453727574011.jpg\\\"/\u003e\u003c/div\u003e\"},{\"title\":\"使用场景\",\"content\":\"\"},{\"title\":\"规格参数\",\"content\":\"\"},{\"title\":\"下单须知\",\"content\":\"\"},{\"title\":\"物流说明\",\"content\":\"\"},{\"title\":\"售后服务\",\"content\":\"\"}]","baseInfoName":"材质类型_数量_成品尺寸-覆膜","preferntialInfo":"[{\"preferentialSort\":1,\"preferentialTitle\":\"优惠套餐\",\"preferentialDescription\":\"购买新名片印刷套餐,立享更多优惠\",\"preferentialLink\":\"http://design.ininin.com/category/131.html\"}]","addedServicesList":[],"params":{"300g铜版纸_1盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_2盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_5盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_10盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_20盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_40盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_100盒_90mm*54mm":{"覆膜":{"覆哑膜":1}}},"type":0,"standardType":0,"showType":0,"websiteShow":1,"homeShow":1,"homeShowIcon":1,"listShow":1,"listShowIcon":2,"minDistNum":-1,"targetId":"0","valuationMethod":0,"valuationValue":0.15,"productVariety":0}})
    content = r.content.decode('utf-8')
    content = content[content.find('(')+1: -2]
    return json.loads(content).get('data')


def _get_product_id_from_url(url='http://www.ininin.com/product/200021.html#300g铜版纸_1盒_90mm*54mm-覆哑膜'):
    """ _get_product_id_from_url
    :return: str product_id eg: 200021
    """
    return url.rsplit('/', 1)[1].split('.')[0]


_DB = get_db('ininin', client='mongo')
_COL = getattr(_DB, 'ininin_data')


def _replace_dot_key_to_dash(data_dict):
    """_replace_dot_key_to_dash mongo的key中不可以含有点,替换成-

    :param data_dict:
    """
    params_dict = data_dict.get('params')
    if not params_dict:
        return data_dict
    new_params_dict = {}
    for k, v in params_dict.items():
        k = k.replace('.', '-')
        new_params_dict[k] = v
    data_dict['params'] = new_params_dict
    return data_dict
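
A quick usage sketch with made-up data, showing the dot-to-dash rewrite that makes the document safe to insert into MongoDB:

data = {'params': {'speed_3.687s': 1}, 'ip': '115.46.80.120'}
print(_replace_dot_key_to_dash(data))
# {'params': {'speed_3-687s': 1}, 'ip': '115.46.80.120'}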
Example #8
File: spider.py  Project: sjl421/pyhome
class LagouCrawler(object):
    curl_str = """
    curl 'http://www.lagou.com/activityapi/icon/showIcon.json?callback=jQuery11130673730597542487_1469756732278&type=POSITION&ids=2034591%2C2147192%2C1899225%2C2112714%2C1993280%2C2107221%2C1980427%2C959204%2C1570458%2C1382996%2C2164841%2C1535725%2C2015991%2C1909703%2C1924731%2C1924585%2C1917417%2C1961327%2C1949207%2C1949217%2C1961114%2C1962767%2C1915882%2C1958811%2C1929575%2C1929708%2C1926524%2C1914752&_=1469756732279' -H 'Cookie: ctk=1469756728; JSESSIONID=006FA63ABE28DD910325F0A2B21D80DD; LGMOID=20160729094529-D8AB7E5EBC00B32D65F29DC499FDEEE0; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1469756733; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1469756733' -H 'X-Anit-Forge-Code: 0' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36' -H 'Accept: text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01' -H 'Referer: http://www.lagou.com/' -H 'X-Requested-With: XMLHttpRequest' -H 'Connection: keep-alive' -H 'X-Anit-Forge-Token: None' --compressed
    """
    base_url = CurlStrParser(curl_str).get_url()
    headers = CurlStrParser(curl_str).get_headers_dict()
    db = get_db('htmldb')
    col = getattr(db, 'lagou_html')    # collection
    sleep = 10

    def __init__(self, domain):
        self.domain = domain
        self.url_manager = UrlManager(domain)
        self.incr_id = IncrId(self.__class__.__name__)

    def add_url(self, url):
        self.url_manager.add_url(url)

    def delay_url(self, url, nums=10):
        self.logger.info('delay url: %s', url)
        self.url_manager.delay_url(url, nums)

    def add_url_list(self):
        for i in range(1, 532):
            url = 'http://www.lagou.com/upload/sitemap/xml/lagou_sitemap_%d.xml' % i
            self.logger.info('sitemap url: %s', url)
            html = self.get_response(url).text
            all_loc_url = extract_all('<loc>', '</loc>', html)
            self.logger.info('%s', pformat(all_loc_url))
            self.add_url(all_loc_url)

    def update_headers(self, changeip=True):
        if changeip:
            change_ip()
        r = get(self.base_url)
        new_cookies = cookie_dict_from_cookie_str(r.headers.get('Set-Cookie'))
        cookies_dict = cookie_dict_from_cookie_str(self.headers['Cookie'])
        cookies_dict.update(new_cookies)
        # header values must be strings, so serialize the merged cookies back
        self.headers['Cookie'] = '; '.join(
            '%s=%s' % (k, v) for k, v in cookies_dict.items())
        self.logger.info('headers: %s', pformat(self.headers))

    def get_response(self, url, **kwargs):
        if CONFIG.CRAWLER.USE_PROXY:
            kwargs.setdefault('proxies', CONFIG.CRAWLER.PROXIES)
        self.logger.info('now crawler: %s', url)
        return get(url, headers=self.headers, **kwargs)

    def url_nums(self):
        return self.url_manager.url_nums()

    def next_url(self, inorder=True):
        if inorder:
            return self.url_manager.first_url()
        else:
            return self.url_manager.last_url()

    def remove_url(self, url):
        self.logger.info('remove url: %s', url)
        return self.url_manager.remove_url(url)

    @staticmethod
    def is_deleted_html(html, verbose=True):
        deleted = '信息已经被删除' in html
        if deleted and verbose:
            print('page says the listing has been deleted')
        return deleted

    @staticmethod
    def is_block_html(html, verbose=True):
        blocked = 'blocked_404' in html
        if blocked and verbose:
            print('blocked page')
        return blocked

    @staticmethod
    def is_check_html(html, verbose=True):
        need_check = '访问验证-拉勾网' in html
        if need_check and verbose:
            print('captcha verification page')
        return need_check

    def save_html(self, url, html):
        self.logger.info('save html of url: %s', url)
        if html:
            self.col.update(
                {
                    '_id': self.incr_id.get(),
                    'url': url,
                },
                {
                    '$set': {'html': html}
                },
                upsert=True
            )

    def run(self):
        if not self.url_nums():
            self.add_url_list()

        self.update_headers()

        while self.url_nums() > 0:
            if self.sleep:
                time.sleep(self.sleep + random.randint(1, 5))

            url = self.next_url()
            if url is not None:
                r = self.get_response(url)
                if not r:
                    self.delay_url(url)
                    self.update_headers()
                    continue

                html = r.text
                if self.is_block_html(html) or self.is_check_html(html):
                    self.delay_url(url)
                    self.update_headers()
                    continue
                else:
                    self.save_html(url, html)
                    self.remove_url(url)
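
The cookie_dict_from_cookie_str helper used in update_headers is defined elsewhere in the project; a minimal sketch of what it presumably does (parse a 'k1=v1; k2=v2' style cookie string into a dict):

def cookie_dict_from_cookie_str(cookie_str):
    """Hypothetical sketch: turn a Cookie/Set-Cookie style string into a dict."""
    cookies = {}
    if not cookie_str:
        return cookies
    for part in cookie_str.split(';'):
        if '=' in part:
            key, value = part.split('=', 1)
            cookies[key.strip()] = value.strip()
    return cookies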
Example #9
class CheckKuaidailiCralwer(CheckXiciCralwer):
    db = get_db('htmldb')
    col = getattr(db, 'kuaidaili_proxy')    # collection
Example #10
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import _env
from pprint import pprint as pp
from tornado.util import ObjectDict
from lib._db import get_db, redis_client as r
from html_parser import Bs4HtmlParser
from lagou_parser import LagouHtmlParser
from spider import LagouCrawler
from web_util import logged

DEBUG = True
db = get_db('htmldb')
lagou_html_col = getattr(db, 'lagou_html')  # collection


def test_get_db():
    o = lagou_html_col.find_one({'_id': 1234})
    html = o['html']
    # print html
    pp(o)


def test_get_html():
    import chardet
    _id = 46167
    o = lagou_html_col.find_one({'_id': _id})
    p = Bs4HtmlParser('', o['html'])
    print(p.html)
    t = p.bs.find('p', class_='msg')