def test(self):
    """Smoke-test: compute simple descriptive stats over one day of crawled
    company-detail JSON pulled from OSS, and log the summary table."""
    # Load raw records for the given OSS path and time range into a DataFrame.
    df = self.fromOSS("downData/www_tianyancha_com/detail/company_1","2018022700-2018022800")

    def stat(record):
        """
        record is one parsed JSON result
        :param record: dict for a single company page
        :return: dict of derived metrics for this record
        """
        # NOTE(review): the key literally contains a trailing space — looks
        # like a typo, but it may match the stored data; confirm against the
        # actual JSON before "fixing" it.
        business_info = record.get("business_info ", {})
        # count of annual reports
        annualReport_Num = len(business_info.get("annualReport",[]))
        # presence flag: 1 when non-empty, else 0
        has_tianyan_risk = 1 if business_info.get("tianyan_risk",None) else 0
        # length of the company name string
        company_name_len = len(record.get("company_title", {}).get("company_name", ""))
        result = {
            "annualReport_Num":annualReport_Num,
            "has_tianyan_risk":has_tianyan_risk,
            "company_name_len":company_name_len
        }
        return result  # (annualReport_Num, has_tianyan_risk, company_name_len)

    # TODO: series->list->df; could probably be optimized to series->df directly
    data = df.apply(lambda x:stat(x.values[0][0]),axis=1).tolist()
    df2 = pd.DataFrame.from_dict(data)
    # describe() yields count/mean/std/min/quartiles per metric column.
    info = df2.describe()
    logDebug(""" #####################\n this is the stat sample,you can try it and save the resulut to db for further visualization\n #####################\n %s\n #####################\n """%info)
    result = info.to_dict()
def getPageNum(self, url, listConf):
    """
    Fetch a list page and extract its total page count.

    :param url: list-page URL to fetch
    :param listConf: list configuration dict; TAG_LIST_TOTAL_PAGE_NUM holds
                     the extraction rule for the total-page-number field
    :return: (totalPageNum, error) tuple — (1, None) when pagination is not
             configured, (0, ERR_BLOCK) when the fetch was blocked,
             (0, None) when the "no result" marker is present on the page
    """
    config = listConf
    # No pagination rule configured -> treat the site as a single page.
    if not config.get(TAG_LIST_TOTAL_PAGE_NUM, None):
        return (1, None)
    content = self.http.get(url)  # fetch the response body
    if self.http.isBlocked():  # antiBlockUtil returns the blocked value or False
        return (0, ERR_BLOCK)
    # Bug fix: use >= 0 — str.find() returns 0 when the marker sits at the
    # very start of the page, which the old "> 0" check silently missed.
    if TAG_LIST_NO_RESULT in config and content.find(config[TAG_LIST_NO_RESULT]) >= 0:
        logDebug("%s url=%s\n%s" % (getTimestamp(), url, config[TAG_LIST_NO_RESULT]))
        return (0, None)
    result = {}
    # Parse the page and extract the total-page-number field into result.
    Extractor(self.http, self.parser).getResultByContent(
        content,
        {TAG_LIST_TOTAL_PAGE_NUM: config[TAG_LIST_TOTAL_PAGE_NUM]},
        result)
    if result[TAG_LIST_TOTAL_PAGE_NUM] != "":
        totalPageNum = int(result[TAG_LIST_TOTAL_PAGE_NUM].strip())
    else:
        totalPageNum = 1  # changed 0 -> 1: an empty extraction still means one page
    return (totalPageNum, None)
def listDir(self, dir1, timeRange=None):
    """
    List object keys under an OSS prefix, optionally filtered by a time range.

    :param dir1: OSS prefix, e.g. downData/www_tianyancha_com/detail/company_1
    :param timeRange: fmt: 2018020100-2018020200 (begin inclusive, end exclusive)
    :return: list of matching object keys, or None after 3 failed attempts
    """
    objects = []
    tryTime = 0

    def getNameTS(name):
        # Extract the yyyy/mm/dd/hh path components as a "yyyymmddhh" string.
        m = re.search(r"(\d{4})/(\d{2})/(\d{2})/(\d{2})", name)
        return "%s%s%s%s" % (m.group(1), m.group(2), m.group(3), m.group(4))

    while tryTime < 3:
        try:
            dir1 = preProcessDir(dir1)
            # Split "begin-end"; both stay None when no range was given.
            beginTime, endTime = timeRange.split("-") if timeRange else (None, None)
            import oss2
            # Enumerate every object stored under the prefix.
            for idx, object_info in enumerate(
                    oss2.ObjectIterator(self.oss, prefix=dir1)):
                if beginTime and endTime:
                    ts = getNameTS(object_info.key)
                    # Keep keys with beginTime <= ts < endTime.
                    if ts < beginTime or ts >= endTime:
                        # logDebug("ignore:%s"%object_info.key)
                        continue
                objects.append(object_info.key)
                logDebug("%s:%s" % (idx, object_info.key))
            return objects
        except Exception:
            # Fix: replaced Python-2-only "except Exception, e" (the bound
            # exception was never used) with the portable form.
            logException()
            tryTime += 1
            time.sleep(1)
def downLists(self, listConf, listItemConf, resultHandlerClass, urlMgr):
    """
    Walk every list URL produced by urlMgr and download/process each page.

    :param listConf: list-level configuration
    :param listItemConf: per-item configuration
    :param resultHandlerClass: class instantiated afresh per page to handle results
    :param urlMgr: urlManager that yields the page URLs
    :return: last download status code (negative means error)
    """
    # Timestamped start marker.
    logInfo("%s_begin downLists" % (getTimestamp()))
    status = 0
    pageCount = 0
    # Page through every URL the manager produces.
    for pageUrl in urlMgr.pageUrls(listConf):
        try:
            logDebug(pageUrl)
            # Fetch the page content and hand it to a fresh result handler.
            self.downOneList(pageUrl, listConf, listItemConf, resultHandlerClass())
            pageCount += 1
            # Probe the download status after each page.
            status = self.checkDownStatus(pageCount)
            if IS_ERROR(status):
                # A negative status aborts the whole crawl loop.
                break
        except Exception:
            logException()
    return status
def testReadAliyun(self):
    """Read JSON records from Aliyun OSS and log each record's name field.

    :return: None
    """
    from superbase.utility.aliyun import AliYun
    records = AliYun().readJsonFromAliyun('downData/www_qichacha_com')
    for record in records:
        logDebug(record['name'])
def asyncRun(cmd, shell=True):
    """
    Launch cmd as a background subprocess without waiting for it to finish.

    stdout/stderr are merged into a pipe; the Popen handle is discarded, so
    the caller cannot inspect the child's output or exit status.

    :param cmd: command line to execute
    :param shell: passed through to subprocess.Popen (default True)
    :return: None
    """
    try:
        logDebug("asyncRun:%s" % cmd)
        subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         shell=shell)
    except Exception:
        # Fix: replaced Python-2-only "except Exception, e" syntax; the bound
        # exception was never used (logException reports the traceback itself).
        logException()
def printDict(dict, log=None):
    """
    Pretty-print a dict as indented JSON through a logger.

    :param dict: the data to dump (note: parameter name shadows the builtin)
    :param log: optional callable that receives the JSON text; when None the
                text is sent to superbase's logDebug instead
    :return: None
    """
    info = json.dumps(dict, indent=4, ensure_ascii=False)
    if not info:
        return
    if log:
        log(info)
    else:
        from superbase.utility.logUtil import logDebug
        logDebug(info)
def exe(conn, sql, values):
    """
    Execute one SQL statement on conn and report the outcome through queue q.

    On success puts ("ok", cursor); on any failure puts ("error", traceback).

    :param conn: DB connection providing cursor()
    :param sql: SQL statement text
    :param values: optional bind parameters; falsy means execute sql alone
    :return: None — results travel through the queue
    """
    try:
        cursor = conn.cursor()
        # Bind parameters only when some were supplied.
        if values:
            cursor.execute(sql, values)
        else:
            cursor.execute(sql)
        if gConfig.get("debug.sql", 0):
            # _last_executed is a driver-private attribute holding the final
            # interpolated SQL string (MySQLdb-style drivers).
            logDebug(cursor._last_executed)
        # Hand the live cursor back to the consumer.
        q.put(("ok", cursor))
    except Exception:
        # Ship the formatted traceback instead of raising across the boundary.
        q.put(("error", traceback.format_exc()))
def getMissedBatch(self):
    """
    Find 'tycdetail' batches that have a finished fetcher job (status=100)
    but no open row in the batch table, then reset their unfinished jobs and
    re-open the batch rows.
    """
    db = gTop.get(CFG_DB_MONITOR)
    # Batches that contain at least one finished tycdetail_fetcher job.
    batches = [
        batch[0] for batch in db.query(
            "select batch from job where name='tycdetail_fetcher' and status=100"
        )
    ]
    # Batches currently marked open (closed=0).
    batches2 = [
        batch[0] for batch in db.query(
            "select batch from batch where batch like 'tycdetail_%' and closed=0"
        )
    ]
    # Batches with finished jobs that are missing from the open set.
    result = list(set(batches) - set(batches2))
    if result:
        # NOTE(review): batch names are interpolated directly into the SQL
        # string — acceptable only while batch names are generated internally.
        sql = "select id, name,status,batch,beginTime from job where batch in ('%s')" % (
            "','".join(result))
        logDebug(sql)
        for id, name, status, batch, beginTime in db.query(sql):
            if status != 100:
                # Reset every unfinished job in the missed batch to status 2.
                db.update("job", {"status": 2}, "where id=%s" % id)
        for batch2 in result:
            # Set closed=0 on the batch row. NOTE(review): these batches were
            # selected precisely because they had no closed=0 row — presumably
            # this re-opens a closed row; confirm the intended closed value.
            db.update("batch", {"closed": 0}, "where batch='%s'" % batch2)
def updateResume2(self):
    """
    Keep a lagou.com resume fresh: log in once, then periodically re-save the
    work-experience section so the resume's update time stays recent.

    Usage: python spider/sample/fanli.py update.time=600 updateResume2
    :return: None (runs an extremely long refresh loop)
    """
    self.http.get("https://www.lagou.com/")
    driver = self.http.driver
    e = findElement(driver, By.XPATH, '//*[@id="changeCityBox"]/ul/li[2]/a')
    if e:
        # dismiss the region-selection popup when it appears
        e.click()
    driver.find_element_by_xpath(
        '//*[@id="lg_tbar"]/div/ul/li[1]/a').click()
    driver.find_element_by_xpath(
        '/html/body/section/div[1]/div[1]/ul/li[1]').click()
    # Fill in the login form (account, password) and submit.
    driver.find_element_by_xpath(
        '/html/body/section/div[1]/div[2]/form/div[1]/input').send_keys(
            "*****@*****.**")
    driver.find_element_by_xpath(
        '/html/body/section/div[1]/div[2]/form/div[2]/input').send_keys(
            "pwdh8f_lagou")
    driver.find_element_by_xpath(
        '/html/body/section/div[1]/div[2]/form/div[5]/input').click()
    time.sleep(10)
    num = 0
    while num < 300000:
        driver.find_element_by_xpath(
            '//*[@id="lg_tbar"]/div/ul/li[2]/a').click()  # resume page
        time.sleep(5)
        driver.find_element_by_xpath(
            '//*[@id="workExperience"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div/em'
        ).click()  # edit
        time.sleep(5)
        driver.find_element_by_xpath(
            '//*[@id="currentUpJobForm"]/div/div[6]/input').click()  # edit
        # Wait update.time seconds (default 300) between refreshes.
        time.sleep(gConfig.get("update.time", 300))
        logDebug("update %s" % num)
        num += 1
def downOneList2(self, url, content, listConf, listItemConf, resultHandler): """ downOneList 的具体实现 :param url: 只是起到log作用 :param content: 页面内容 :param listConf: 列表配置 :param listItemConf: 列表项配置 :param resultHandler: 结果的handler :return: error:-1,ok:0 """ # pq(etree.parse())直接接受一个文档,按照文档结构解析 # StringIO经常被用来作字符串的缓存,因为StringIO的一些接口和文件操作是一致的, # 同样的代码,可以同时当成文件操作或者StringIO操作。 # getroot 获取原网页的根 root = pq(etree.parse(StringIO(content), self.parser).getroot()) # list 行数组的模式 css = listConf[TAG_LIST_ITEMS] trs = root(css) if trs and len(trs) > 0: for idx, tr in enumerate(trs): # enumerate 列举 try: result = {} # 把提取的原网页内容 以行组模式 css选择器 为匹配方式 并以dict形式 保存到result中 self.extractor.getResult(pq(tr), listItemConf, result) # debug输出 logDebug(getPrintDict(result)) # 输出 -->BaseResultHandler().handle(result) resultHandler.handle(result) except Exception: logException() else: # 没有这个lsits 并打印错误的url logError("%s !no lists" % url) return -1 return 0
def updateResume(self):
    """
    Keep a liepin.com resume fresh: log in, then periodically refresh it so
    the resume's update time stays recent.

    Usage: python spider/sample/fanli.py update.time=600 updateResume
    :return: None (loops indefinitely; re-logs-in when the refresh link vanishes)
    """
    while True:
        self.http.get("https://www.liepin.com/sh/")
        driver = self.http.driver
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[2]/form/div[3]/p/a'
        ).click()
        time.sleep(5)
        # Fill in the login form (account, password) and submit.
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[1]/input'
        ).send_keys("*****@*****.**")
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[2]/input'
        ).send_keys("pwdh8f_liepin")
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/input[3]'
        ).click()
        time.sleep(10)
        num = 0
        while num < 300000:
            # Locate the refresh link; None when the session has expired.
            e = findElement(
                driver, By.XPATH,
                '// *[ @ id = "home"] / div[3] / div[2] / div[1] / div[4] / ul / li[1] / a'
            )
            if e:
                e.click()
                # Wait update.time seconds (default 300) between refreshes.
                time.sleep(gConfig.get("update.time", 300))
                self.http.get("https://c.liepin.com/")
                logDebug("update %s" % num)
                num += 1
            else:
                # Refresh link missing — restart the outer loop to log in again.
                logError("try again!")
                break
if (os.path.isdir(d) == True): deleteNullDir(d) if not os.listdir(dirr): info = 'del empty dir: %s' % dirr try: os.rmdir(dirr) logInfo("done:%s" % info) except Exception, e: logException("fail-%s" % info) ds = list(os.walk(root)) # 获得所有文件夹的信息列表 stopTime = time.time() - before for d in ds: # 遍历该列表 try: from superbase.utility.logUtil import logDebug logDebug("processing-%s" % d[0]) os.chdir(d[0]) # 进入本级路径,防止找不到文件而报错 if d[2] != []: # 如果该路径下有文件 for x in d[2]: # 遍历这些文件 try: lastmodifytime = os.stat(x).st_mtime # 获取文件创建时间 if lastmodifytime < stopTime: try: info = "del-%s/%s" % (d[0], x) os.remove(x) # 删掉 logInfo("done:%s" % info) except Exception, e: logException("fail:%s" % info) except Exception, e: logException() except Exception, e:
def runProcess(cmd,
               outInfo=None,
               maxOutInfoNum=1000,
               debug=False,
               redirect=False,
               exitInfo=None):
    """
    Run cmd synchronously in a subprocess, streaming its merged stdout/stderr.

    :param cmd: command line; when redirect is True a trailing "> path" is
                stripped off and output is also written to that file
    :param outInfo: optional list that collects output lines
    :param maxOutInfoNum: line-count threshold for trimming outInfo (<=0
                          disables trimming)
    :param debug: when True only log the command, do not execute it
    :param redirect: enable the "> file" redirection handling described above
    :param exitInfo: stop reading output once a line contains this substring
    :return: outInfo (possibly None), or outInfo unchanged when an exception
             was caught and logged
    """
    from superbase.utility.logUtil import logInfo
    # Bug fix: redirectFile was previously assigned only inside the
    # "if redirect:" branch, so every call with redirect=False raised
    # NameError at the first "if redirectFile:" check below.
    redirectFile = None
    try:
        if redirect:
            idx = cmd.rfind(">")
            if idx > 0:
                # Redirection requested: resolve the target to an absolute
                # path and make sure its directory exists.
                outfile = cmd[idx + 1:].strip()
                outfile = os.path.abspath(outfile)
                logInfo("redirect-file=%s" % outfile)
                dir1 = os.path.dirname(outfile)
                from superbase.utility.ioUtil import mkdir
                mkdir(dir1)
                redirectFile = open(outfile, "w")
                cmd = cmd[:idx]
        logDebug("\n%s the cmd is %s\n" % (timeUtil.getCurTime(), cmd))
        if debug:
            # Debug mode: command is only logged, never executed.
            return
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=True)
        lineNum = 0
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if exitInfo and line.find(exitInfo) >= 0:
                # Sentinel text seen — stop consuming output.
                break
            if (outInfo != None):
                outInfo.append(line)
                lineNum += 1
                if maxOutInfoNum > 0 and lineNum > maxOutInfoNum:
                    # NOTE(review): del outInfo[:-1] keeps only the LAST line;
                    # if the intent was a rolling buffer this discards almost
                    # everything — confirm before changing.
                    del outInfo[:-1]
                    lineNum = 0
                    if redirectFile:
                        redirectFile.flush()
            if redirectFile:
                redirectFile.write(line)
        if redirectFile:
            redirectFile.close()
        logDebug("process-done:%s" % cmd)
    except Exception:
        from superbase.utility.logUtil import logException
        logException()
    return outInfo
def showProcess(self, exe="python"):
    """Log the process listing for the given executable name.

    :param exe: executable name to look up (default "python")
    :return: None
    """
    processInfo = showProcess(exe)
    logDebug(processInfo)