Example #1
0
 def __init__(self, params=None):
     """Initialize the job, layering a small debug-only config over the defaults."""
     # These two entries exist purely to make local debugging easier.
     debug_cfg = {
         # CFG_JOB_BATCH: "split_test20140717",
         # CFG_JOB_NAME: "split",
         "env": "DEV",
     }
     BaseClass.__init__(self, params, debug_cfg)
Example #2
0
 def __init__(self, params=""):
     """Set up the Aliyun OSS client from credentials held by the account manager."""
     overrides = {
         CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
         # ALIYUN_LOCALROOT: PROJECT_ROOT,  # normally this is the local root
     }
     import oss2
     BaseClass.__init__(self, params, overrides)

     account = AccountManager().getAccount(ALIYUN)
     key_id = account["accessKeyId"]
     key_secret = account["accessKeySecret"]
     self.bucket = account["bucket"]
     auth = oss2.Auth(key_id, key_secret)
     self.oss = oss2.Bucket(auth, account["endPoint"], self.bucket)
     # Local mirror root; normalize Windows path separators to forward slashes.
     local_root = gConfig.get(ALIYUN_LOCALROOT, self._getDefaultDownRoot())
     self.prefix = local_root.replace("\\", "/")
Example #3
0
    def __init__(self, params, subConfig=None):
        """Initialize the spider: merge config, pick the HTTP engine and parser,
        and start the job.

        Args:
            params: Raw job parameters forwarded to BaseClass.
            subConfig: Optional dict of overrides merged over the built-in
                defaults below (caller-supplied values win).
        """
        self.basicConfig = {
            # HTTP download related
            CFG_HTTP_INTERVAL: 0.01,  # delay between requests (seconds)
            CFG_HTTP_TIMEOUT: 10,  #
            CFG_HTTP_OUTFORMAT: 'html',  # or json
            CFG_HTTP_ENCODING: 'utf-8',  # or gbk
            CFG_HTTP_UNESCAPE: 0,  # remove special character quoting
            CFG_HTTP_ENGINE: 'requests',  # or selenium
            CFG_HTTP_UA: 'windows',  # mac,ios,android
            CFG_HTTP_BROWSERMODE: 'headless',  #
            CFG_HTTP_BROWSER: BROWSER_TPE_PHANTOMJS,
            # Max requests per session; 0 = unlimited, otherwise the session
            # is restarted once the count is exceeded.
            CFG_HTTP_MAXREQUEST: 0,
            CFG_JOB_RUNTIME: 0,  # crawler run time in seconds; 0 = unlimited
            CFG_JOB_HEARTBEAT: 60,  # job heartbeat interval in seconds
            CFG_DOWN_MAXNUM: 0,  # max downloads per crawl; 0 = unlimited
            CFG_DOWN_MAXPAGENUM: 0,  # max pages per crawl; 0 = unlimited
            # Anti-block element check: after more than 100 failed checks we
            # treat it as blocked (could also mean the page structure changed).
            CFG_BLOCK_MAXCHECK: 100,
            CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
        }

        if subConfig:
            # Caller overrides win over the defaults.
            self.basicConfig.update(subConfig)
        # Publish the merged config into the global config and log it.
        BaseClass.__init__(self, params, self.basicConfig)
        # Dependent config must come immediately after BaseClass.__init__.
        if not gConfig.get(CFG_DOWN_ROOT, None):
            # BUG FIX: the original used `in ("ONLINE")`, which is a plain
            # STRING, so any substring of "ONLINE" (e.g. "LINE") matched.
            # A one-element tuple gives the intended exact membership test.
            gConfig.set(
                CFG_DOWN_ROOT, "d:/" if gConfig.get("env") in ("ONLINE",)
                and not isLinux() else PROJECT_ROOT)
        SpiderJobUtil.__init__(self)

        # Use requests unless the configured engine is selenium.
        self.http = RequestsAgent() if gConfig.get(
            CFG_HTTP_ENGINE, "requests") != "selenium" else SeleniumAgent()
        # Parse downloaded pages with lxml using the globally configured encoding.
        self.parser = etree.HTMLParser(encoding=gConfig.get(CFG_HTTP_ENCODING))
        # The extractor ties the HTTP agent and the parser together.
        self.extractor = Extractor(self.http, self.parser)
        self.antiBlock = None

        # Kick off the job.
        self.syncPoint = self.jobBegin()
Example #4
0
 def __init__(self, params="", cfg=None):
     """Thin constructor: hand params and cfg straight through to BaseClass."""
     BaseClass.__init__(self, params, cfg)
Example #5
0
 def __init__(self, params=""):
     """Delegate all initialization to BaseClass with the given params."""
     BaseClass.__init__(self, params)
Example #6
0
 def __init__(self, params=None, subCfg=None):
     """Initialize the base class first, then mix in the JobUtil state."""
     BaseClass.__init__(self, params, subCfg)
     JobUtil.__init__(self)
Example #7
0
 def __init__(self):
     """Initialize the base class and prepare the local SQLite store."""
     BaseClass.__init__(self)
     self.sqlite = Sqlite()
     # Make sure the local database exists before anything queries it.
     self.createLocalDb()
Example #8
0
    def __init__(self, params=""):
        """Initialize BaseClass, then force the spider account provider globally."""
        BaseClass.__init__(self, params)
        gConfig.set(CFG_ACCOUNT_PROVIDER, "spideraccount.samanager")