예제 #1
0
import logging

from requests.packages.urllib3.connectionpool import log as requests_log
from selenium.webdriver.remote.remote_connection import LOGGER as selenium_log

from msic import config
from msic.core.service import mongodb_service

# Raise the Selenium remote-connection logger and the urllib3
# connection-pool logger to WARNING so that records below that level
# are dropped by those loggers.
selenium_log.setLevel(logging.WARNING)
requests_log.setLevel(logging.WARNING)

# Name of the database handed to get_db() below.
DATABASE_NAME = "guju"

# MongoDB: module-level handle obtained from the project helper, using the
# client object exposed by msic.config.
# NOTE(review): presumably a database object for DATABASE_NAME - confirm
# against mongodb_service.get_db.
mongodb = mongodb_service.get_db(config.mongodb_client, DATABASE_NAME)

# Directory path on drive D:.
# NOTE(review): looks like the storage root for downloaded images (the
# name matches Scrapy's IMAGES_STORE setting) - confirm how it is consumed.
IMAGES_STORE = 'D:/scrapy'

# Proxy switch, currently disabled.
# NOTE(review): the consumers are not visible here - confirm what it gates.
USE_PROXY = False
예제 #2
0
import logging

import redis
from requests.packages.urllib3.connectionpool import log as requests_log
from selenium.webdriver.remote.remote_connection import LOGGER as selenium_log

from common import mongodb_service

# Quieten third-party loggers: Selenium's remote-connection logger only
# passes WARNING and above, urllib3's connection-pool logger only ERROR
# and above.
selenium_log.setLevel(logging.WARNING)
requests_log.setLevel(logging.ERROR)

# Name of the database handed to get_db() below.
DATABASE_NAME = "Suzhou_Bus"

# Host and port passed to mongodb_service.get_client() below.
HOST = 'localhost'
PORT = 27017

# MongoDB: module-level handle built from a client for HOST:PORT.
# NOTE(review): presumably a database object for DATABASE_NAME - confirm
# against mongodb_service.get_db / get_client.
mongodb = mongodb_service.get_db(mongodb_service.get_client(HOST, PORT),
                                 DATABASE_NAME)

# Location of the PhantomJS executable on the developer machine.
# Written as a raw string so the Windows backslashes are taken literally
# instead of forming invalid escape sequences such as "\F" or "\D"
# (DeprecationWarning since Python 3.6, SyntaxWarning on 3.12+).
# Alternative location kept for reference:
# phantomjs_path = r'D:\Development\python3\Scripts\phantomjs.exe'
phantomjs_path = r'C:\Flyn\Development\SDK\python3\Scripts\phantomjs.exe'

# Redis connection settings; REDIS_HOST and REDIS_PORT are passed to
# redis.StrictRedis(...) below.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# Despite the "NAME" suffix this is an integer.
# NOTE(review): presumably the Redis database index - confirm at the
# StrictRedis call site.
REDIS_DATABASE_NAME = 0

# Redis
redis_client = redis.StrictRedis(host=REDIS_HOST,
                                 port=REDIS_PORT,
예제 #3
0
        pass

    def process_start_requests(self, start_requests, spider):
        """Pass the spider's start requests through unchanged.

        Called with the start requests of the spider. Works similarly to
        process_spider_output(), except that no response is associated
        with the call. Must yield only requests (not items).

        Args:
            start_requests: Iterable of the spider's start requests.
            spider: The spider the start requests belong to (unused here).

        Yields:
            Every element of ``start_requests``, in order.
        """
        # Delegate to the incoming iterable rather than re-yielding by hand.
        yield from start_requests

    def spider_opened(self, spider):
        """Log an INFO message announcing that ``spider`` was opened.

        Args:
            spider: Object exposing ``name`` and a ``logger`` that
                provides ``info()``.
        """
        # Pass the name as a logging argument so interpolation is left to
        # the logging framework (and skipped when INFO is disabled).
        spider.logger.info('Spider opened: %s', spider.name)


requests_log.setLevel(logging.WARNING)  # Set the log output level of the requests library


class GlobalhospitalsDownloaderMiddleware(RandomProxy):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self):
        """Rewrite the proxy list file, then initialise the parent class.

        Prints the configured ``PROXY_TEST_SITE``, runs
        ``GetIp.get_proxy_ip()`` followed by ``GetIp.verify_proxies()``,
        writes every entry of ``proxy_list`` on its own line to the file
        named by ``settings['PROXY_LIST']`` (truncating it first), and
        finally calls the parent initialiser with the module-level
        ``settings``.
        """
        test_site = settings['PROXY_TEST_SITE']
        print('Test Site: {}'.format(test_site))

        proxy_source = GetIp()
        proxy_source.get_proxy_ip()
        proxy_source.verify_proxies()

        # NOTE(review): the file is written before the parent initialiser
        # runs - presumably RandomProxy reads it during __init__; confirm.
        with open(settings['PROXY_LIST'], 'w') as proxy_file:
            proxy_file.writelines(
                entry + '\n' for entry in proxy_source.proxy_list
            )
        super().__init__(settings)