def critical(self, msg, *args, **kwargs):
    """
    Log a critical message, de-duplicated per exception site.

    An id is derived from the "!myex!File ... line ..." traceback line plus
    the date, job name and batch, so the same exception is logged (and
    written to the db) only once per process unless LOG_ALWAYS is present.
    :param msg: message text, usually containing a formatted traceback
    """
    try:
        from superbase.globalData import gConfig
        name = gConfig.get(CFG_JOB_NAME)
        batch = gConfig.get(CFG_JOB_BATCH)
        from superbase.utility.timeUtil import getTimestamp
        curTime = getTimestamp()

        def _createId():
            # scan the traceback bottom-up for the innermost marked frame
            lines = msg.split("\n")
            lines.reverse()
            for line in lines:
                if line.find("!myex!File ") >= 0 and line.find(" line ") > 0:
                    # date prefix keeps the id stable within one day only
                    return hash("%s_%s_%s_%s" % (curTime[:8], line, name, batch))  # or use hash_lib.md5
            return None

        # renamed from `id` to avoid shadowing the builtin
        excId = _createId()
        if excId and excId not in self.exceptions:
            # the db guarantees log uniqueness across multiple processes
            ret = self.logDB(name, batch, msg, excId)
            if ret != LOG_EXIST or msg.find(LOG_ALWAYS) >= 0:
                self.logger.critical(
                    "{name} {batch} {msg}".format(name=name, batch=batch, msg=msg),
                    *args, **kwargs)
                self.exceptions.append(excId)
        elif gConfig.get("env") == "DEV":
            print("dev-debug:%s" % msg)
    except ImportError as e1:
        # fixed broken "/n/" escape; `as` form works on py2.6+ and py3
        print("%s\n%s" % (e1, sys.path))
def getProxyFromAPI(self):
    """
    Fetch a working proxy from the configured provider.

    "zhima": returns/recycles proxies through the database.
    "mogu": pulls a batch from the HTTP proxy API, validates each entry
    with chekIP, and retries up to 10 times before giving up.
    :return: "http://ip:port" on success, otherwise None (implicit)
    """
    provider = gConfig.get(CFG_SERVER_PROXYAPI, "mogu")
    if provider == "zhima":
        # hand the current proxy back before drawing a fresh one
        if self.currProxyInfo:
            self.giveBackProxy()
        return self.proxyFromDB()
    elif provider == "mogu":
        attempts = 0
        while attempts < 10:
            if not self.proxyList:
                res = requests.get(self.proxyServer)
                raw = res.content
                logInfo(raw)
                payload = json.loads(raw)
                # "3001": rate-limited by the API, back off before retrying
                if payload["code"] == "3001":
                    time.sleep(5)
                if payload["code"] == "0":
                    self.proxyList = payload["msg"]
            if self.proxyList:
                entry = self.proxyList.pop()
                ipPort = "http://{}:{}".format(entry["ip"], entry["port"])
                if self.chekIP(ipPort):
                    return ipPort
            attempts += 1
            time.sleep(2)
        logInfo("try to get proxies is disabled!!!!check out!!!")
def getResultByUrl(self, url, template, result):
    """
    Fetch a url and extract its content into `result`.

    :param url: page url to GET
    :param template: extraction template passed to getResultByContent
    :param result: dict mutated in place with the extracted data
    :return: raw page content, or None for an empty/blank response
    """
    content = self.http.get(url)  # issue the GET request
    if content in (u' ', u'', None):
        return None
    # optionally dump the page for debugging wrong/blocked responses
    if gConfig.get(CFG_DEBUG_SAVEFILE):
        AntiBlock.saveWrongPage(
            content,
            gConfig.get(CFG_DEBUG_SAVEFILENAME)
        )
    # renamed from `format` to avoid shadowing the builtin
    outFormat = gConfig.get(CFG_HTTP_OUTFORMAT, "html")
    if outFormat == "json":
        # json responses need no parsing; merge directly
        result.update(content)
    elif outFormat == "html":
        self.getResultByContent(content, template, result)
    elif outFormat == "file":
        result["file"] = content
    return content
def createDb(dbNameKey, dbParams=None):
    """
    Create a database connection and register it in the global store.

    :param dbNameKey: currently db.monitor (monitoring) or db.business (business)
    :param dbParams: DEV and TEST have defaults; ONLINE params come from jobManager
    :return: None; on success the connection is stored via gTop.set(dbNameKey, db)
    """
    from superbase.globalData import gConfig, gTop
    # CFG_DB_DISABLE disables db access entirely; deprecated
    if gConfig.get(CFG_DB_DISABLE, 0):
        return
    # resolve the concrete db name for this key
    dbName = gConfig.get(dbNameKey)
    try:
        # explicit dbParams win; otherwise look the account up
        db_params = AccountManager().getAccount(dbNameKey) if not dbParams else dbParams
        if not db_params:
            return
        db = createDb2(dbName, db_params,
                       dictCursor=gConfig.get(CFG_DB_DICTCURSOR, 0))
        # publish the connection in the global single-point store
        gTop.set(dbNameKey, db)
    except Exception:
        logException()
def __init__(self):
    """Initialize job state; a Job is created only when a jobId is present."""
    # jobEnable follows jobId: no job id means the job manager is off
    gConfig.set(CFG_JOB_ENABLE, gConfig.get(CFG_JOB_ID, 0))
    self.job = None
    self.jobError = None
    if gConfig.get(CFG_JOB_ENABLE, 0):
        # jobManager: the task manager backing this run
        from jobManager.job import Job
        self.job = Job()
def saveWrongPage(content2, htmlFile=None):
    """
    Persist a suspicious/blocked page under the log directory for inspection.

    :param content2: raw page content to write
    :param htmlFile: target file name; derived from the log file name plus a
        random suffix when omitted
    :return: (file name, full path) of the written file
    """
    import random
    if not htmlFile:
        # derive a unique-ish name from the configured log file
        suffix = "%s.html" % (random.randint(100, 999))
        htmlFile = gConfig.get(CFG_LOG_FILE_NAME).replace(".txt", suffix)
    htmlFile = os.path.join(PROJECT_ROOT + "log/", htmlFile)
    fname = os.path.split(htmlFile)[1]
    import codecs
    encoding = gConfig.get(CFG_HTTP_ENCODING, "utf-8")
    with codecs.open(htmlFile, 'wb', encoding) as f:
        f.write(content2)
    logInfo("saveWrongPage:%s" % htmlFile)
    return fname, htmlFile
def checkError(self):
    """
    Detect abnormally frequent account fetching.

    :return: True when the last `limit` fetches happened within `interval`
        seconds (i.e. something is wrong)
    """
    interval = gConfig.get("account.checkInterval", 300)  # window, seconds
    limit = gConfig.get("account.checkLimit", 10)  # max fetches in window
    if len(self.history) > limit:
        # history entries are tuples with the timestamp at index 0 — TODO confirm
        # bug fix: was hard-coded [-10], which ignored a configured limit
        diff = self.history[-1][0] - self.history[-limit][0]
        if diff < interval:
            logError("getAccount too frequently!!-%s" % diff)
            return True
    return False
def getLogger(cfg, forceNew=False):
    """
    Build (or reuse) the process-wide logger from `cfg`.

    :param cfg: dict of logging settings; keys in IN_PARAMS_KEY patch
        BASIC_SETTINGS, CFG_LOG_FILE_NAME selects the file handler path
    :param forceNew: rebuild even when a logger already exists
    :return: the singleton adapted logger stored under GD_LOGGER
    """
    from superbase.globalData import gTop
    if not gTop.get(GD_LOGGER) or forceNew:  # singleton unless forced
        from superbase.globalData import gConfig
        from superbase.globalData import PROJECT_ROOT
        from superbase.utility.ioUtil import getPrintDict, mkdir
        logDir = os.path.join(PROJECT_ROOT, "log")
        mkdir(logDir)
        for key, value in cfg.items():
            if key in IN_PARAMS_KEY:
                # three-level path into the dictConfig settings
                k1, k2, k3 = IN_PARAMS_KEY[key]
                BASIC_SETTINGS[k1][k2][k3] = value
            elif key == CFG_LOG_FILE_NAME:
                logFileName = os.path.join(logDir, value)
                logFileDir = os.path.split(logFileName)[0]
                mkdir(logFileDir)
                BASIC_SETTINGS["handlers"]["file"]["filename"] = logFileName
        logging.config.dictConfig(BASIC_SETTINGS)
        logger2 = logAdaper(logging.getLogger(SMILE_LOGGER))
        gTop.set(GD_LOGGER, logger2)  # logger#
        hint = "current code root %s\n--config is--\n%s" % (
            PROJECT_ROOT, getPrintDict(gConfig.cfg))
        # under a managed job, surface the config at info level
        if gConfig.get(CFG_JOB_ID, 0) > 0:
            logger2.info(hint)
        else:
            logger2.debug(hint)
    return gTop.get(GD_LOGGER)
def saveSyncPoint(self, result, sync2Remote=False):
    """
    Persist the sync point locally and periodically push it to remote.

    :param result: crawl result dict; its CFG_DOWN_SYNCINFO entry (if any)
        is consumed and removed
    :param sync2Remote: when True, push to remote on every save
    :return: `result` with the sync info stripped
    """
    if self.index:
        try:
            index = self.index
            # only act when the result carries sync-point info
            if CFG_DOWN_SYNCINFO in result:
                syncInfo = result.get(CFG_DOWN_SYNCINFO, {})
                # strip the sync info from the returned result
                del result[CFG_DOWN_SYNCINFO]
                data = {
                    "id": md5(index),
                    "idx": index,
                    "syncInfo": syncInfo,
                    "upTime": getTimestamp()  # timestamp of this save
                }
                json2File(self.localFile, data)
                self.saveNum += 1
                # push to remote every CFG_DOWN_SYNCINTERVAL saves (default 5)
                if sync2Remote or (self.saveNum % gConfig.get(CFG_DOWN_SYNCINTERVAL, 5) == 1):
                    self.syncToRemote(data)
        except Exception:
            # was py2-only `except Exception, e` with unused `e`
            logException()
    return result
def __init__(self, params=None): # 添加前面两个配置只是为了调试方便 myCfg = { CFG_JOB_BATCH: "fanli_test20140717", CFG_JOB_NAME: "fanli", # CFG_HTTP_ENCODING:"gbk", CFG_HTTP_ENGINE: "selenium", CFG_HTTP_BROWSER: "chrome", #selenium默认用phantomjs做browser CFG_HTTP_BROWSERMODE: "1", #selenium默认用phantomjs做browser CFG_DOWN_INDEX: "www_fanli_com/coupon" } BaseCrawler.__init__(self, params, myCfg) resultFile = gConfig.get(CFG_LOG_FILE_NAME).replace( ".txt", "_result.txt") self.resultFile = os.path.join(PROJECT_ROOT + "log/", resultFile) self.file = open(self.resultFile, "w") import re self.urlPattern = re.compile(r"[url,go]=(http.*)") self.patterns = [ { "url": re.compile(r"url=(http.*)"), "sellerId": re.compile(r"seller_?[I,i]d%3D(.*?)%"), "couponId": re.compile(r"activity_?[I,i]d%3D(.*?)[&,%]"), "productId": re.compile(r"Epid-(.*?)%"), "discount": re.compile(ur"满(.*?)减(.*)"), }, { "url": re.compile(r"go=(http.*)"), # "sellerId": re.compile(r"[sellerId,seller_id]%3D(.*?)%"), "couponId": re.compile(r"activity_?[I,i]d%3D(.*?)[&,%]"), "productId": re.compile(r"itemId%3D(.*?)[&,%]"), "discount": re.compile(ur"(\d+)"), }, ]
def __init__(self, params=None, subConfigDict=None):
    """
    :param params: highest-priority config, overrides everything; usually from the command line
    :param subConfigDict: second-priority config, overrides the parent class; usually fixed subclass settings
    :return:
    """
    newCfg = params or subConfigDict
    if newCfg:
        # Unified config entry point: merges global, class and input config,
        # all accessible through gConfig afterwards.
        # parseParams converts the str-typed params into a dict config.
        configIn = self.parseParams(params)  # input first
        if subConfigDict:
            # merge the parsed input over the subclass config
            subConfigDict.update(configIn)  # subClass second
        else:
            subConfigDict = configIn
        # record raw input config in the global single-point store
        gTop.get(GD_CFG_IN).update(configIn)
        # push the merged subclass config into the global config
        gConfig.update(subConfigDict)
        # NOTE(review): structure reconstructed from a collapsed source line;
        # the two statements below may have been meant to run unconditionally
        # (outside `if newCfg:`) — confirm against callers.
        gConfig.set("env", gConfig.get("env").upper())  # make sure capital
        # create the logger once config is in place
        createLogger(gConfig)
def getNewSyncInfoByDesc(self, oldSyncInfo, initBegin=None, initEnd=-365 * 10 * 24 * 3600 * 1000):
    """
    Build the next sync window for descending sync keys (e.g. post time).

    :param oldSyncInfo: previous sync info, or falsy on the first crawl
    :param initBegin: window start in ms; defaults to the current time
    :param initEnd: negative ms offset added to initBegin for the first
        window end; defaults to 10 years back
    :return: sync info processed by self._getSyncInfoByCfg
    """
    syncInfo = oldSyncInfo
    if not initBegin:  # default to now, in milliseconds
        initBegin = time.time() * 1000
    if not syncInfo:  # first crawl: initialize the whole window
        syncInfo = {
            CFG_DOWN_INCPOINT: initBegin,  # start of the next incremental pass
            CFG_DOWN_SYNCBEGIN: initBegin,  # window begin
            CFG_DOWN_SYNCCURR: initBegin,  # current crawl position
            CFG_DOWN_SYNCEND: initBegin + initEnd  # window end (in the past)
        }
    else:
        # backlog finished — or abandoned via incremental mode — so switch
        # to the normal incremental window
        if syncInfo[CFG_DOWN_SYNCCURR] <= syncInfo[CFG_DOWN_SYNCEND] or gConfig.get(CFG_DOWN_INCMODE, 0):
            syncInfo[CFG_DOWN_SYNCEND] = syncInfo[CFG_DOWN_INCPOINT]
            syncInfo[CFG_DOWN_INCPOINT] = syncInfo[CFG_DOWN_SYNCBEGIN] = syncInfo[CFG_DOWN_SYNCCURR] = initBegin
        else:
            # previous run did not finish: resume from the break point
            logInfo("use break point mode,go on crawling from the last break point")
            # syncInfo[CFG_DOWN_INCPOINT] = syncInfo[CFG_DOWN_SYNCBEGIN],
            syncInfo[CFG_DOWN_SYNCBEGIN] = syncInfo[CFG_DOWN_SYNCCURR]
    return self._getSyncInfoByCfg(syncInfo)
def update(self, table, params, condition=""):
    """
    Run a parameterized UPDATE on `table`.

    :param table: table name
    :param params: column -> new value mapping
    :param condition: optional trailing SQL (e.g. "where id=1")
    :return: None
    """
    assignments = []
    values = []
    # build "col=<placeholder>" pairs and the matching value tuple
    for key, value in params.items():
        assignments.append(key + "=" + self.valueHolder)
        values.append(value)
    sql = "update %s set %s %s" % (table, ",".join(assignments), condition)
    cur = self.safeExecute(sql, tuple(values))
    self.commitTransaction(cur)
    from superbase.globalData import gConfig
    if gConfig.get("debug.sql", None):
        # debug only: render and log a human-readable version of the statement
        rendered = []
        for key, value in params.items():
            rendered.append("`%s`='%s'" % (key, value))
        sql = "update {table} set {params} {condition}".format(
            table=table, params=",".join(rendered), condition=condition)
        logInfo(sql)
def antiBlockTest1(self):
    """
    Anti-block test driven by direct block-info text matching.

    Crawls jobui; when requests are too frequent the site may show a login
    page or an anti-crawl message, which this config detects.
    :return:
    """
    from spiderx.common.utility.antiBlockUtil import AntiBlockStrategy
    strategy = gConfig.get(CFG_AB_STRATEGY) if gConfig.get(
        CFG_AB_STRATEGY, None) else "postpone 2;changeAccount"

    # changeAccount must be implemented by the subclass
    class MyAntiBlockStrategy(AntiBlockStrategy):
        def __init__(self, strategy):
            AntiBlockStrategy.__init__(self, strategy)

        def changeAccount(self):
            logInfo("implement:change account,relogin,...")

    antiBlocksConf = [
        # direct-block test; blockInfo here is ordinary page text on purpose
        {
            # url key; '*' matches any page
            'key': ['*'],
            # several infos are allowed, each with its own strategy
            'blockInfo': [{
                "info": u"We're sorry but the page",
                "strategy": MyAntiBlockStrategy(strategy)
            }],
        },
    ]
    # step 1: register the antiBlock config
    self.addAntiBlock(antiBlocksConf)
    # this conf is arbitrary; test only
    conf1 = {
        "logo": CssElement("div.company-logo > a > img", "src"),
        "name": CssElement("#companyH1 > a")
    }
    # on a direct block the session restarts automatically, 10 retries by default
    self.downOne(
        "http://www.jobui.com/cmp?keyword=%E4%B8%8A%E6%B5%B7%E4%BF%A1%E7%A4%BC",
        conf1)
    if self.antiBlock.isNeedExit():
        logInfo(u"重试10次失败 \n 无法继续处理,人工干预")
        return
def __init__(self):
    # business db is only provisioned in the ONLINE environment
    if gConfig.get("env") == "ONLINE":
        params = AccountManager().getAccount(CFG_DB_BUSINESS)
        if params:
            self.db = createDb2("loginHelperDb", params, dictCursor=1)
    # NOTE(review): structure reconstructed from a collapsed line; self.db may
    # remain unset outside ONLINE — presumably callers guard on env. Confirm.
    self.proxyServerList = AccountManager().getAccount(CFG_SERVER_PROXY)
    # index 2 appears to be the proxy API endpoint — TODO confirm account layout
    self.proxyServer = self.proxyServerList[2]
    self.proxyList = []
    self.usedIP = 0
    if gConfig.get(CFG_SERVER_PROXYAPI, None) == "zhima":
        # site-key -> short code mapping used by the zhima provider
        self.website = {
            "www_tianyancha_com": "tyc",
            "www_qichacha_com": "qcc",
        }
        self.proxyId = 0
        self.currWeb = self.website[gConfig.get(CFG_DOWN_WEBSITE)]
        self.currProxyInfo = None
def retry(self):
    """
    Reset the blocked flag and retry; give up after CFG_AB_MAXRETRY attempts.

    When the retry budget (default 2) is exceeded, logs critically, resets
    the counter and asks the crawler to exit.
    """
    self.blocked = 0
    self.retryNum += 1
    logInfo("blocked!!--retry-%s" % self.retryNum)
    if self.retryNum > gConfig.get(CFG_AB_MAXRETRY, 2):
        # typo fixed: "antiblcok" -> "antiblock"
        logCritical("retry antiblock Fail")
        self.retryNum = 0
        self.setExit()
def getSavePath(self, params=None):
    """
    Build (and create) the download save directory for the current batch.

    Layout: <downRoot>/downData/<index>/<hour>/<batch>
    :param params: unused; kept for interface compatibility
    :return: the created directory path
    """
    index = self.getDownIndex()
    hour = getHour(getTimestamp())
    rootPath = gConfig.get(CFG_DOWN_ROOT)
    subPath = "downData/{index}/{hour}/{batch}".format(
        index=index, batch=gConfig.get(CFG_JOB_BATCH), hour=hour)
    path = os.path.join(rootPath, subPath)
    mkdir(path)
    return path
def __init__(self, params, subConfig=None):
    """
    Crawler base: install default config, pick the http engine/parser,
    and start the job.

    :param params: command-line style config (highest priority)
    :param subConfig: subclass overrides merged over the defaults
    """
    self.basicConfig = {
        # http down related
        CFG_HTTP_INTERVAL: 0.01,  # request interval (s)
        CFG_HTTP_TIMEOUT: 10,
        CFG_HTTP_OUTFORMAT: 'html',  # or json
        CFG_HTTP_ENCODING: 'utf-8',  # or gbk
        CFG_HTTP_UNESCAPE: 0,  # remove special character quoting
        CFG_HTTP_ENGINE: 'requests',  # or selenium
        CFG_HTTP_UA: 'windows',  # mac,ios,android
        CFG_HTTP_BROWSERMODE: 'headless',
        CFG_HTTP_BROWSER: BROWSER_TPE_PHANTOMJS,
        CFG_HTTP_MAXREQUEST: 0,  # max requests per session; 0 = unlimited, else session restarts
        CFG_JOB_RUNTIME: 0,  # crawl runtime in seconds; 0 = unlimited
        CFG_JOB_HEARTBEAT: 60,  # job heartbeat interval, seconds
        CFG_DOWN_MAXNUM: 0,  # max downloads per run; 0 = unlimited
        CFG_DOWN_MAXPAGENUM: 0,  # max pages per run; 0 = unlimited
        CFG_BLOCK_MAXCHECK: 100,  # element-check threshold before declaring blocked (may be a layout change)
        CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
    }
    if subConfig:
        # subclass config overrides the defaults
        self.basicConfig.update(subConfig)
    # merge everything into the global config and set up logging
    BaseClass.__init__(self, params, self.basicConfig)
    # config-dependent setup must follow BaseClass.__init__
    if not gConfig.get(CFG_DOWN_ROOT, None):
        # bug fix: `in ("ONLINE")` was a substring test on the string
        # "ONLINE", not a tuple membership test
        gConfig.set(
            CFG_DOWN_ROOT,
            "d:/" if gConfig.get("env") in ("ONLINE",) and not isLinux()
            else PROJECT_ROOT)
    SpiderJobUtil.__init__(self)
    # choose the http engine: requests unless selenium was configured
    self.http = RequestsAgent() if gConfig.get(
        CFG_HTTP_ENGINE, "requests") != "selenium" else SeleniumAgent()
    # lxml parser using the configured page encoding
    self.parser = etree.HTMLParser(encoding=gConfig.get(CFG_HTTP_ENCODING))
    # extractor combines the http agent and the parser
    self.extractor = Extractor(self.http, self.parser)
    self.antiBlock = None
    # job starts here; returns the sync point to resume from
    self.syncPoint = self.jobBegin()
def isOnDevelop():
    """
    Whether the configured environment is LOCAL development.

    :return: True when env (case-insensitive) equals "LOCAL"
    """
    env = gConfig.get("env")
    return env.upper() == "LOCAL"
def __init__(self, provider=None):
    """
    :param provider: module object, or full module name, e.g. xxAccount.provider;
        used as the fallback when CFG_ACCOUNT_PROVIDER is not configured
    """
    resolved = gConfig.get(CFG_ACCOUNT_PROVIDER, provider)
    # a string means a module path: import it lazily
    if isinstance(resolved, basestring):
        import importlib
        resolved = importlib.import_module(resolved)
    self.provider = resolved
def __init__(self, index=None):
    """
    Sync-point holder backed by a local json file under syncDir.

    :param index: download index; falls back to CFG_DOWN_INDEX when omitted
    """
    self.index = gConfig.get(CFG_DOWN_INDEX) if not index else index
    if self.index:
        # removed a redundant nested `if self.index:` — already guaranteed here
        syncDir = os.path.join(PROJECT_ROOT, "syncDir")
        path1 = os.path.join(syncDir, "%s.json" % self.index)
        mkdir(os.path.split(path1)[0])
        self.localFile = path1
        self.lastSyncInfo = None
        self.saveNum = 0
        # remote sync only when running under the job manager or debug.sync set
        self.syncRemote = gConfig.get(CFG_JOB_ENABLE) or gConfig.get("debug.sync")
        self.checkSync()
    else:
        logError("no syncIndex!!")
        self.syncRemote = False
def doCheckBlock(self, url, content, antiBlock):
    # Detect blocking two ways: (1) known block text in the page, (2) an
    # expected element repeatedly missing. Returns the strategy to apply,
    # or None implicitly when not blocked.
    blockInfo = antiBlock.get("blockInfo", None)
    if blockInfo:
        for b1 in blockInfo:
            info = b1["info"]
            # direct text match: any hit means blocked
            self.blocked = BLOCKED_INFO if content.find(info) > 0 else 0
            if self.blocked:
                logError("!!!!block by %s,url=%s" % (gConfig.get(CFG_JOB_NAME), url))
                return b1["strategy"]
    # check the elements
    blocked = False
    element = antiBlock.get("blockElement", None)
    if element:
        strategy = element.get("strategy", None)
        elements = element["elements"]
        for template, value in elements:
            result = {}
            self.extractor.getResultByContent(content, template, result)
            checkName = result.get("name", None)
            # element missing, or present but without the expected value
            if not checkName or (value and checkName.find(value) == -1):
                blocked = True
            else:
                blocked = False
                break  # any healthy element means not blocked; stop early
        if blocked:
            self.blockCheck += 1
            logError("%s:the element not exist,block?%s" % (self.blockCheck, url))
        else:
            self.blockCheck = 0  # reset
        globalCheckNum = gConfig.get(CFG_BLOCK_MAXCHECK, 30)
        # a per-element maxCheckNum overrides the global threshold
        localCheckNum = element.get("maxCheckNum", globalCheckNum)
        # NOTE(review): structure reconstructed from a collapsed line; the
        # `return strategy` placement inside this threshold branch is the
        # most plausible reading — confirm against callers.
        if self.blockCheck > localCheckNum:
            logError("block by element,pls check the content,maybe the structure has changed!")
            self.blocked = BLOCKED_ELEMENT
            self.blockCheck = 0
            return strategy
def __init__(self, delAc=False):
    """
    Account store backed by the business database.

    :param delAc: when True, accounts are deleted after use — TODO confirm usage
    """
    params = AccountManager().getAccount(CFG_DB_BUSINESS)
    self.db = createDb2("loginHelperDb", params, dictCursor=1)
    self.curMaxId = 0
    # paging offset into the account table
    self.offset = gConfig.get(CFG_DB_OFFSET, 0)
    self.history = []
    self.delAc = delAc
def getDownIndex(self):
    """
    :return: the download index path without a leading or trailing "/";
        may contain multiple levels, refined step by step
    """
    idx = gConfig.get(CFG_DOWN_INDEX)
    # drop at most one slash from each end
    if idx.startswith("/"):
        idx = idx[1:]
    if idx.endswith("/"):
        idx = idx[:-1]
    return idx
def alarmPageError(self, url, content, downInfo):
    """
    Element extraction failed: possibly blocked, possibly a page-structure
    change. Saves the page, records the incident and emails an alarm for
    manual inspection.
    :param url: page url that failed
    :param content: raw page content (saved to disk for inspection)
    :param downInfo: downNum, downTime, downInterval etc.
    :return:
    """
    fname, filePath = AntiBlock.saveWrongPage(content)
    info = {
        'jobName': gConfig.get(CFG_JOB_NAME),
        'batch': gConfig.get(CFG_JOB_BATCH),
        'url': url,
        'filePath': filePath,
        'type': self.blocked,
        'detail': json.dumps(downInfo),
        'inTime': getTimestamp(),
    }
    title = "block-%s" % self.blocked
    content = getPrintDict(info)
    attach = [(fname, filePath)]
    emails2 = [gConfig.get(CFG_JOB_EMAIL)] if gConfig.get(CFG_JOB_EMAIL, None) else []
    if gConfig.get(CFG_JOB_ENABLE, 0):
        # under the job manager: record to db and mail through Job
        gTop.get('db').insert("block", info)
        from jobManager.job import Job
        Job().sendEmail(
            title=title,
            content=content,
            attach=attach,
            emails2=emails2
        )
    else:
        # standalone run: mail directly
        Mail.sendEmail(
            title=title,
            content=content,
            t_address=emails2,
            attaches=attach
        )
    logError("blocked?check the content\n%s" % getPrintDict(info))
def pageUrls(self, listConf):
    """
    Generator over list-page urls, auto-paginated from beginUrl.

    :param listConf: list-page config passed to getPageNum/getNextPageUrl
    :yield: each page url; stops early when the http agent reports a block
    """
    try:
        url = self.beginUrl
        totalPage, err = self.getPageNum(url, listConf)
        if err:
            logError("getPageNum error?%s,url=%s" % (err, url))
        logInfo("%s url=%s\ntotalPage=%s" % (getTimestamp(), url, totalPage))
        # cap the page count when CFG_DOWN_MAXPAGENUM is configured (non-zero)
        if int(gConfig.get(CFG_DOWN_MAXPAGENUM)):
            totalPage = min(int(totalPage), int(gConfig.get(CFG_DOWN_MAXPAGENUM)))
        for page in range(int(totalPage)):
            try:
                url2 = self.getNextPageUrl(url, page + 1, listConf)
                if self.http.isBlocked():
                    break
                yield url2
            except Exception:
                logException()
    except Exception:
        # was py2-only `except Exception, e` with unused `e`
        logException()
def test1(self, val1, val2):
    """
    Demonstrates:
    1. using the global config parameter test.size
    2. script invocation:
       python superbase/sample1.py "env=DEV,test.size=101" test1 hello world
       python superbase/sample1.py "env=DEV,test.size=99" test1 hello world
    :return:
    """
    # bug fix: command-line config values arrive as strings, and in py2
    # `str > int` is always True, so both demo cases logged val1.
    # Cast to int so the comparison is numeric.
    size = int(gConfig.get("test.size", 0))
    if size > 100:
        logInfo("size=%s-%s" % (size, val1))
    else:
        logInfo("size=%s-%s" % (size, val2))
def __init__(self, params=""):
    """
    Aliyun OSS client wrapper; credentials come from the account provider.

    :param params: command-line style config forwarded to BaseClass
    """
    subCfg = {
        CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
        # ALIYUN_LOCALROOT: PROJECT_ROOT,  # normally this is the local root
    }
    import oss2
    BaseClass.__init__(self, params, subCfg)
    aliyunCfg = AccountManager().getAccount(ALIYUN)
    auth = oss2.Auth(aliyunCfg["accessKeyId"], aliyunCfg["accessKeySecret"])
    self.bucket = aliyunCfg["bucket"]
    self.oss = oss2.Bucket(auth, aliyunCfg["endPoint"], aliyunCfg["bucket"])
    # local prefix mapped onto the bucket; normalize to forward slashes
    localRoot = gConfig.get(ALIYUN_LOCALROOT, self._getDefaultDownRoot())
    self.prefix = localRoot.replace("\\", "/")
def exe(conn, sql, values):
    """
    Execute `sql` on `conn` and report the outcome through the queue `q`
    (closed over from the enclosing scope): ("ok", cursor) on success,
    ("error", traceback-string) on failure.
    """
    try:
        cur = conn.cursor()
        # only pass values when present; some drivers reject empty params
        if values:
            cur.execute(sql, values)
        else:
            cur.execute(sql)
        if gConfig.get("debug.sql", 0):
            # _last_executed is a private driver attribute — TODO confirm driver
            logDebug(cur._last_executed)
        q.put(("ok", cur))
    except Exception:
        err = traceback.format_exc()
        # ship the error back over the queue instead of raising
        q.put(("error", err))
def getLogFileName(batch, jobName):
    """
    Resolve the log file paths for a run.

    :param batch: batch id; when falsy both results are None
    :param jobName: job name used for the per-job log
    :return: (jobLog, nodeLog) paths under <PROJECT_ROOT>log/
    """
    from superbase.globalData import PROJECT_ROOT
    from superbase.globalData import gConfig
    jobLog = None
    nodeLog = None
    if batch:
        logDir = PROJECT_ROOT + "log/"  # project convention: root has trailing slash
        # per-job log: <logDir>/<batch>/<jobName>.txt
        jobLog = os.path.join(logDir, "%s/%s.txt" % (batch, jobName))
        # node-level log from the configured log file name
        nodeLog = os.path.join(logDir, gConfig.get(CFG_LOG_FILE_NAME))
    return jobLog, nodeLog