Example #1
    def __init__(self):
        # Connect to the database
        engine = create_engine('mysql://%s:%s@%s:%s/%s?charset=utf8' %
                               (cp.get('db', 'user'), cp.get('db', 'pwd'),
                                cp.get('db', 'host'), cp.get('db', 'port'),
                                cp.get('db', 'dbname')),
                               echo=False)

        DBSession = sessionmaker(bind=engine)
        self.metadata = MetaData(engine)
        self.db_session = DBSession()

        # Create the table(s) if they do not exist yet
        create_table(engine)

        # Warm up the proxy pool for the target site
        self.ipTool = IPProxyTool()
        self.ipTool.destIP = 'https://www.dianping.com'
        self.ipTool.refresh()

        # Refresh proxies periodically from a background daemon thread
        self.thread = threading.Thread(target=self.schedule)
        self.thread.setDaemon(True)
        self.thread.start()
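Both this snippet and Example #6 read their database settings through cp, which is presumably a ConfigParser instance loaded elsewhere. A minimal sketch of that assumed setup (the filename config.ini is an assumption; only the [db] section and its option names are implied by the cp.get() calls above):

from ConfigParser import ConfigParser  # configparser on Python 3

cp = ConfigParser()
# The cp.get('db', ...) calls above expect a [db] section providing
# user, pwd, host, port and dbname.
cp.read('config.ini')  # assumed filename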
Example #2
class DianpingSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def __init__(self):
        self.ipTool = IPProxyTool()
        super(DianpingSpiderMiddleware, self).__init__()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).

        for r in start_requests:
            yield r

    def process_request(self, request, spider):
        # Attach a random proxy to the request, or no proxy if the pool is empty
        ips = self.ipTool.getIPs()
        request.meta['proxy'] = 'http://' + choice(ips) if ips else None
        Logging.debug('{(request_url)%s : (proxy)%s}' %
                      (request.url, request.meta['proxy']))

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
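In Scrapy, process_request() is only invoked on downloader middlewares, so this class has to be registered under both middleware settings for every hook above to run. A hedged sketch of that wiring (the module path and priorities are assumptions, not taken from the original project):

# settings.py
SPIDER_MIDDLEWARES = {
    'dianping.middlewares.DianpingSpiderMiddleware': 543,  # assumed path
}
DOWNLOADER_MIDDLEWARES = {
    'dianping.middlewares.DianpingSpiderMiddleware': 543,  # assumed path
}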
Example #3
    def request_ticket_info(self):
        Logging.debug('<---------- Requesting ticket info ---------->')
        train_infos = []

        for url in self.request_ticket_urls:
            # Rebuild the proxy candidates from the current IP pool
            proxies = [dict(http='http://' + ip, https='http://' + ip)
                       for ip in IPProxyTool().getIPs()]

            print(url)
            if proxies:
                response = self.session.get(url,
                                            verify=False,
                                            proxies=choice(proxies))
            else:
                response = self.session.get(url, verify=False)
            try:
                if response.status_code == 200:
                    # Logging.debug(requests.utils.dict_from_cookiejar(self.session.cookies))
                    self.session.cookies.save(ignore_discard=True)
                    info_json = response.json()
                    result = info_json['data']['result']
                    train_infos.append(result)
                else:
                    Logging.warning(response.text.encode('utf-8'))
                    continue
            except Exception as e:
                Logging.debug(response.text)
                Logging.warning(str(e))
                continue
        return self.parse_result(train_infos)
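The self.session.cookies.save(ignore_discard=True) call above only works when a file-backed cookie jar is attached to the requests session; the default RequestsCookieJar has no save() method. A minimal sketch of the session setup this snippet assumes (the jar class and filename are assumptions; the original session construction is not shown):

import requests
from cookielib import LWPCookieJar  # http.cookiejar on Python 3

session = requests.Session()
# Replace the default jar with one that can persist cookies to disk.
session.cookies = LWPCookieJar('cookies.txt')
session.cookies.load(ignore_discard=True)  # once the file exists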
Example #4
    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            retry_times = request.meta.get('retry_times', 0)
            Logging.warning(
                '<---------- {(url)%s (retry_times):%s} ---------->' %
                (request.url, retry_times))
            if retry_times == 2:
                # Second retry: fall back to a direct (proxy-less) connection
                request.meta['proxy'] = None
            elif retry_times == 3:
                # Third retry: record the URL that keeps failing
                with open('fail_urls.txt', 'a') as f:
                    f.write(request.url + '\n')
            else:
                # Otherwise pick another random proxy, if any is available
                ips = IPProxyTool().getIPs()
                request.meta['proxy'] = 'http://' + choice(ips) if ips else None
                # Retry with the query string stripped from the URL
                if '?' in request.url:
                    request = request.replace(url=request.url.split('?')[0])

            return self._retry(request, exception, spider)
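Neither self.EXCEPTIONS_TO_RETRY nor self._retry() is defined in the snippet; in the Scrapy versions this code targets, both are provided by the built-in RetryMiddleware, so the method above is presumably an override in a subclass. A minimal sketch of that assumed context (the class name is hypothetical):

from scrapy.downloadermiddlewares.retry import RetryMiddleware

class ProxyRetryMiddleware(RetryMiddleware):  # hypothetical name
    # EXCEPTIONS_TO_RETRY and _retry() are inherited from RetryMiddleware;
    # process_exception() as shown above would be defined here.
    pass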
Example #5
    def request_proxy(self, min_ip_count=5):
        # Point the proxy tool at the seed URL, then refill the pool
        # (one instance, so destIP is still set when refresh() runs)
        ip_tool = IPProxyTool()
        ip_tool.destIP = INITURL
        ip_tool.refresh(min_ip_count=min_ip_count)
Example #6
class DianpingPipeline(object):
    def __init__(self):
        # Connect to the database
        engine = create_engine('mysql://%s:%s@%s:%s/%s?charset=utf8' %
                               (cp.get('db', 'user'), cp.get('db', 'pwd'),
                                cp.get('db', 'host'), cp.get('db', 'port'),
                                cp.get('db', 'dbname')),
                               echo=False)

        DBSession = sessionmaker(bind=engine)
        self.metadata = MetaData(engine)
        self.db_session = DBSession()

        # Create the table(s) if they do not exist yet
        create_table(engine)

        # Warm up the proxy pool for the target site
        self.ipTool = IPProxyTool()
        self.ipTool.destIP = 'https://www.dianping.com'
        self.ipTool.refresh()

        # Refresh proxies periodically from a background daemon thread
        self.thread = threading.Thread(target=self.schedule)
        self.thread.setDaemon(True)
        self.thread.start()

    def process_item(self, item, spider):
        # Each item field holds a parallel list of values scraped from one
        # page; persist one DianPing row per shop.
        cuisines = item['cuisine']
        names = item['name']
        shop_urls = item['shop_url']
        stars = item['star']
        comment_counts = item['comment_count']
        avg_prices = item['avg_price']
        districts = item['district']
        streets = item['street']
        update_times = item['update_time']

        for index in range(len(names)):
            model = DianPing(cuisine=cuisines[index],
                             shop_url=shop_urls[index],
                             name=names[index],
                             star=stars[index],
                             comment_count=comment_counts[index],
                             avg_price=avg_prices[index],
                             district=districts[index],
                             street=streets[index],
                             update_time=update_times[index])
            self.db_session.add(model)
        self.db_session.commit()

        return item

    def close_spider(self, spider):
        self.db_session.close()

    def schedule(self):
        # Runs in the daemon thread started in __init__; blocks forever,
        # refreshing the proxy pool every 300 seconds.
        scheduler = BlockingScheduler()
        scheduler.add_job(self.ipTool.refresh, 'interval', seconds=300)
        scheduler.start()
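Scrapy only calls process_item() on pipelines enabled in the project settings. A hedged sketch of that wiring (the module path and priority are assumptions):

# settings.py
ITEM_PIPELINES = {
    'dianping.pipelines.DianpingPipeline': 300,  # assumed path
}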