def __downlowdFile(self, url, req):
    # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
    """
    Logic for downloading a file. Follows Wang Yang's original logic; not yet debugged.
    :param url:
    :param req:
    """
    reqheaders = req.headers
    revealfn = url.split('/')[-1]
    if "." in revealfn[-6:]:
        fileName = revealfn
    else:
        if 'Content-Disposition' in reqheaders.keys():
            fileName = reqheaders['Content-Disposition'].split('filename=')[1]
            fileName = fileName.replace('"', '').replace("'", "")
        else:
            r = urllib.request.urlopen(url)
            if r.url != url:
                fileName = basename(urlsplit(r.url)[2])
            else:
                # Fall back to the last path segment so fileName is never unbound.
                fileName = revealfn
    self.URL_inf.FileName = fileName
    if self.URL_inf.FilePath:
        _FileName = str(self.URL_inf.FilePath) + fileName
    else:
        _FileName = fileName
    with open(_FileName, "wb") as donefile:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                donefile.write(chunk)
    Log.i("File: " + _FileName + " downloaded")
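# The helper above assumes the response was requested with streaming enabled. A minimal,
# self-contained sketch of the same chunked-download idea (standalone function with
# hypothetical names, not part of the original class):
import os
import requests

def download_file_sketch(url, dest_dir="."):
    """Stream a file to disk in 1 KiB chunks; returns the local path."""
    local_name = os.path.join(dest_dir, url.split('/')[-1] or "download.bin")
    with requests.get(url, stream=True, timeout=10) as resp:
        resp.raise_for_status()
        with open(local_name, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    fh.write(chunk)
    return local_name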
def run(self):
    '''
    Dispatch.
    :return: None
    '''
    Log.i('Pipeline.run()')
    if self.task['results'] is not None and len(self.task['results']) > 0:
        # Enqueue the follow-up tasks.
        if self.task['next_tasks'] is not None:
            for next_task in self.task['next_tasks']:
                self.taskUtil.insert_one(next_task)
        # Persist this run's parse results.
        # Use reflection to call pipeline_<parser name>(); fall back to pipeline_default() if it does not exist.
        if hasattr(self, 'pipeline_' + self.task['parser']):
            func = getattr(self, 'pipeline_' + self.task['parser'])
            func(self.task['table'])
        else:
            self.pipeline_default(self.task['table'])
        # Save the completed task back to MongoDB and mark it as done.
        self.task['state'] = 'done'
        self.taskUtil.replace_one(self.task['_id'], self.task)
    else:
        # No results were parsed, which means something went wrong; wait for the next run.
        pass
    Log.i('this task is finished')
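# The pipeline_<parser>() lookup above is plain getattr-based dispatch. A minimal
# standalone sketch of the same pattern (class and method names are hypothetical,
# for illustration only):
class DispatchDemo:
    def pipeline_ccgp(self, table):
        print("ccgp pipeline ->", table)

    def pipeline_default(self, table):
        print("default pipeline ->", table)

    def dispatch(self, parser_name, table):
        # Prefer pipeline_<parser_name>() and fall back to pipeline_default().
        func = getattr(self, 'pipeline_' + parser_name, self.pipeline_default)
        func(table)

# DispatchDemo().dispatch('ccgp', 'bid_results')     # -> ccgp pipeline
# DispatchDemo().dispatch('unknown', 'bid_results')  # -> default pipeline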
def find_one_and_replace(self, collection_name, filter_dict, replace_dict, upsert=False, auto_uptime=True):
    """
    Find a document and replace it atomically; returns None when nothing was replaced.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :param replace_dict: dict  full replacement document
    :param upsert: bool  insert the document if no match is found
    :param auto_uptime: bool  automatically stamp the replacement with uptime/uptimestamp fields
    :return: Document  the document after replacement, or None
    """
    result = None
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            replace_dict['uptime'] = uptime
            replace_dict['uptimestamp'] = uptimestamp
        collection = self.database.get_collection(collection_name)
        document = collection.find_one_and_replace(filter_dict, replace_dict,
                                                   upsert=upsert,
                                                   return_document=ReturnDocument.AFTER)
        result = document
        if result is None:
            Log.i("[INFO] find and replace matched nothing!")
        else:
            Log.d("[INFO] find and replace success!")
    except Exception as e:
        Log.e('find and replace failed: %s' % e)
    finally:
        return result
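# For reference, the underlying pymongo call used above looks like this. A minimal
# standalone sketch with made-up URI, database, collection, and field values:
import pymongo
from pymongo import ReturnDocument

def replace_task_state_sketch(uri="mongodb://127.0.0.1:27017", db="crawler"):
    client = pymongo.MongoClient(uri)
    tasks = client[db]["tasks"]
    # Atomically swap the whole document and get back the new version.
    return tasks.find_one_and_replace(
        {"uuid": "abc123"},
        {"uuid": "abc123", "state": "done"},
        upsert=True,
        return_document=ReturnDocument.AFTER)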
def __init__(self, host, port, db_name, default_collection):
    Log.i('Init MongoDB')
    # Unlike the legacy Connection() class, MongoClient is designed to be
    # thread-safe and can be shared across threads.
    self.client = pymongo.MongoClient(
        host=host,
        port=port,
        connect=False
    )
    self.db = self.client.get_database(db_name)
    self.collection = self.db.get_collection(default_collection)
def run(self):
    '''
    Poll the task queue in a loop at a configured interval and hand each task to a Downloader.
    :return:
    '''
    # Create the process pool.
    pool = Pool()
    while True:
        # Fetch one ready task and mark it as "doing".
        task = self.taskUtil.get_ready()
        if task is not None and len(task) > 0:
            Log.i('-----------------------------')
            # Run the Downloader in the process pool.
            pool.apply_async(self.run_downloader, args=(task, ))
        # Sleep n seconds (read from the config file).
        items = ConfigUtil.getItems('scheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('Start sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
    pool.close()
    pool.join()
    Log.i('All subprocesses done.')
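# A minimal standalone sketch of the Pool.apply_async pattern used above
# (the worker function and the task values are hypothetical):
from multiprocessing import Pool
import time

def run_downloader_sketch(task):
    print("downloading", task)
    time.sleep(0.1)
    return task

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        results = [pool.apply_async(run_downloader_sketch, args=(t,)) for t in range(5)]
        for r in results:
            print("done:", r.get())  # .get() re-raises any exception from the worker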
def run():
    if len(sys.argv) < 4:
        Log.i("no params error")
        os._exit(0)
    else:
        isCrontab = sys.argv[1]
        begin = sys.argv[2]
        end = sys.argv[3]
        # Whether the script was started by crontab: 1 means yes, anything else means no.
        if isCrontab == str(1):
            crontab = 1
        else:
            crontab = 0
        p_list = list()
        producerProcess = ProducerUrl("producer", crontab, begin, end)
        p_list.append(producerProcess)
        # start = TimeUtil.getDefaultTimeIt()
        for p in p_list:
            p.daemon = True
            p.start()
        for p in p_list:
            p.join()
        # end = TimeUtil.getDefaultTimeIt()
        # Log.i('ProducerUrlParentPid run for %.2fm' % (end - start))
        if crontab == 1:
            os._exit(0)
def consumerurl(self):
    """
    Consume parent links.
    """
    conf = localKafka_setting
    self.consumer.subscribe([conf['topic_name']])
    # Iterating over the consumer blocks while waiting for messages; on timeout the
    # underlying SDK reconnects automatically, so this loop runs forever by design.
    for message in self.consumer:
        jsondata = str(message.value, "utf-8")
        Log.i(jsondata)
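# A minimal standalone kafka-python consumer sketch equivalent to the loop above
# (the bootstrap address, group id, and topic name are placeholders, not the
# project's real localKafka_setting values):
from kafka import KafkaConsumer

def consume_urls_sketch():
    consumer = KafkaConsumer(
        bootstrap_servers='127.0.0.1:9092',
        group_id='crawler-consumers',
        auto_offset_reset='earliest')
    consumer.subscribe(['parent_urls'])
    for message in consumer:  # blocks waiting for new records
        print(message.value.decode('utf-8'))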
def _sendChildUrl(self, URL_inf, mutex_lock):
    # Save the data, extract the child links, and feed them back into the corresponding topic.
    KafkaOperator = kafkaUrlinformation()
    # TODO use a dedicated class per site to manage site-specific logic
    parseCCGPModule = ParserCCGPModule(URL_inf, KafkaOperator)
    ccgpChildrenLink = parseCCGPModule.getLinks()
    if ccgpChildrenLink is None:
        mutex_lock.release()  # end of the critical section
        return
    for link in ccgpChildrenLink:
        # Downstream (Yu Hao) does not want parent links, so only forward child links.
        if link.DeepNum >= 0:
            Log.i("produce<<" + json.dumps(link.class2dict()))
            KafkaOperator.producerUrl(json.dumps(link.class2dict()))
def run(self):
    try:
        pass
    except Exception as e:
        Log.i("AsyncThreadScanner run exception<<" + str(e))
    # Remove this thread from the thread list.
    AsyncThreadScanner.lck.acquire()
    AsyncThreadScanner.tList.remove(self)
    # If removing this finished thread brings the count down to the limit minus one,
    # a thread is waiting to run, so set the event to release it.
    if len(AsyncThreadScanner.tList) == AsyncThreadScanner.maxThreads - 1:
        AsyncThreadScanner.event.set()
        AsyncThreadScanner.event.clear()
    AsyncThreadScanner.lck.release()
def run(self):
    '''
    Free-IP-proxy process entry point; runs run_downloader in a child process.
    :return:
    '''
    Log.i('Downloader.run() in {0}'.format(time.ctime()))
    p_list = list()
    downloaderRun = Process(target=self.run_downloader, args=(self.pipeDictData, ))
    p_list.append(downloaderRun)
    for p in p_list:
        p.daemon = True
        p.start()
    for p in p_list:
        p.join()
def run(self):
    '''
    Default thread entry point; dispatches the download.
    :return:
    '''
    Log.i('Downloader.run()')
    # Use reflection to call download_<parser name>(); fall back to download_default() if it does not exist.
    if hasattr(self, 'download_' + self.task['parser']):
        func = getattr(self, 'download_' + self.task['parser'])
        func()
    else:
        self.download_default()
    # Start the parser.
    parserModule = Setting.PARSER_MODULE
    ParserX = importlib.import_module(parserModule)
    parser = ParserX.Parser(self.task)
    parser.run()
def run(self):
    '''
    Dispatch.
    :return:
    '''
    Log.i('Parser.run()')
    # Use reflection to call parse_<parser name>(); fall back to parse_default() if it does not exist.
    if hasattr(self, 'parse_' + self.task['parser']):
        func = getattr(self, 'parse_' + self.task['parser'])
        func()
    else:
        self.parse_default()
    # Start the Pipeline.
    pipelineModule = Setting.PIPELINE_MODULE
    PipelineX = importlib.import_module(pipelineModule)
    pipeline = PipelineX.Pipeline(self.task)
    pipeline.run()
def update_proxy():
    """
    Fetch and validate a proxy IP address.
    :return:
    """
    if USE_PROXY:
        i = 0
        while True:
            try:
                get_proxy()
                notify_ip_address()
                return True
            except Exception:
                i += 1
                Log.e("Failed to fetch a proxy, retrying (attempt %s)" % (i, ))
    else:
        Log.i('notify address')
        notify_ip_address()
def run(self):
    '''
    Process entry point; receives data from the pipe in a loop.
    :return:
    '''
    Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
    while True:
        # Wait for data from the pipe.
        DictData = self.pipeDictData.recv()
        if DictData is None:
            continue
        # Convert the raw dict into the URL entity class.
        self.URL_inf.dict2class(DictData)
        # Make sure the MongoDB connection exists.
        if self.mogodbControl is None:
            self.mogodbControl = Mongodb_Operator(
                DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                DbdataCCGPDFZB["db_name"], DbdataCCGPDFZB["default_collection"])
        # Make sure the Kafka connection exists.
        if self.KafkaOperator is None:
            self.KafkaOperator = localKafkaUrlinformation()
        # Deduplicate: skip URLs that are already stored.
        uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
        item = {"uuid": uuid}
        value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
        # TODO inserting into the database still has issues
        if value is not None:
            continue
        # Download the page content.
        self.URL_inf = self.downLoadHtml()
        if self.URL_inf is None:
            continue
        # Save the data asynchronously.
        self.savedata(self.URL_inf)
        # Sleep n seconds (read from the config file).
        items = ConfigUtil.getItems('consumerScheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
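# get_md5() itself is not shown in this file; a minimal sketch of such a dedup-key helper,
# assuming it simply hashes the URL together with the title (illustrative only):
import hashlib

def get_md5_sketch(url, title):
    """Build a stable dedup key from the URL and title."""
    raw = (url or '') + (title or '')
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

# get_md5_sketch('http://search.ccgp.gov.cn/...', '某采购项目招标公告')
# -> 32-character hex digest used as the "uuid" field for the Mongo lookup above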
def get_proxy():
    """
    Fetch a proxy IP and update the module-level PROXIES dict.
    :return: a usable proxy dict, or None
    """
    global PROXIES
    if USE_PROXY is False:
        return None
    try:
        Log.i('Fetching proxy...')
        resp = requests.get(PROXY_URL, timeout=TIMEOUT)
        ip_address = resp.text
        proxies = {'http': ip_address, 'https': ip_address}
        # Log.i(proxies)
        PROXIES = proxies
        return PROXIES
    except Exception as e:
        Log.e('Unable to fetch proxy information; make sure the proxy service is running')
        return None
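# The fetched address is used as-is above. A minimal sketch of validating it first by
# making a test request through the proxy (PROXY_TEST_URL and the helper name are
# assumptions, not part of the original code):
import requests

PROXY_TEST_URL = 'http://www.ccgp.gov.cn/'

def check_proxy_sketch(proxies, timeout=5):
    """Return True if a simple GET through the proxy succeeds."""
    try:
        resp = requests.get(PROXY_TEST_URL, proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False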
def savedata(self):
    """
    Save the data to MongoDB.
    """
    uuid = self.get_md5()
    item = {
        "uuid": uuid,
        "url": self.Url_inf.Urlname,
        "title": self.Url_inf.title,
        "time": datetime.now().timestamp(),
        "lastTime": self.Url_inf.LastTime,
        "content": self.Url_inf.content,
        "fatherUrl": self.Url_inf.fatherUrl,
        "province": self.Url_inf.province
    }
    string = self.Url_inf.domain.replace('.', '').replace('/', '').replace(':', '')
    # In source-URL mode, deduplicate before inserting; otherwise insert directly.
    if USE_SOURCEURL_TYPE is True:
        if self.Url_inf.province is not None and self.Url_inf.content is not None:
            value = self.myDb.findone({"uuid": uuid})  # returns the document if found
            if value is None:
                Log.i(self.Url_inf.content.decode('utf-8'))
                self.myDb.insert(item, string)
                self.myDb.ensure_index("uuid", string)
                self.KafkaOperator.producterUUID(
                    json.dumps({
                        "uuid": uuid,
                        'collection': string
                    }))
    else:
        self.myDb.insert(item, string)
        self.myDb.ensure_index("uuid", string)
        self.KafkaOperator.producterUUID(
            json.dumps({
                "uuid": uuid,
                'collection': string
            }))
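# ensure_index() above is a project wrapper. With plain pymongo the uuid lookup can be
# backed by a unique index so duplicate inserts fail fast (a minimal sketch; the
# collection object is whatever the wrapper manages internally):
import pymongo

def ensure_uuid_index_sketch(collection):
    # create_index is idempotent: calling it again with the same spec is a no-op.
    collection.create_index([('uuid', pymongo.ASCENDING)], unique=True)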
def run(self):
    '''
    Free-IP-proxy process entry point; reads tasks in a loop.
    :return:
    '''
    Log.i('proxyIpPool.run() in {0}'.format(time.ctime()))
    while True:
        # Call the free proxy APIs and push the results into the IP queue.
        proxyIpPool = getIpProxyPool()
        # The remote source has been switched to the local API as well.
        proxyIpPoolFromeRemote = getIpProxyPool()
        # proxyIpPoolFromeRemote = getIpProxyPoolFromeRemote()
        if proxyIpPool is not None:
            self.queueDictData.put(proxyIpPool)
        if proxyIpPoolFromeRemote is not None:
            self.queueDictData.put(proxyIpPoolFromeRemote)
        # Sleep n seconds (read from the config file).
        items = ConfigUtil.getItems('proxyIpScheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('proxyIpPool sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
def run(self):
    '''
    Producer process entry point; reads the task sources in a loop.
    :return:
    '''
    Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
    while True:
        # Produce URLs.
        if USE_SOURCEURL_TYPE is True:
            if USE_ASYNCTASK_TYPE is True:
                urlInformationList = ConfigUtil.readSourceListByParams(self.begin, self.end)
            else:
                urlInformationList = ConfigUtil.readSourceList()
        else:
            urlInformationList = ConfigUtil.readTaskList()
        if urlInformationList is None:
            continue
        for urlInfor in urlInformationList:
            data = urlInfor.class2dict()
            diststrjson = json.dumps(data)
            Log.i(diststrjson)
            KafkaOperator = kafkaUrlinformation()
            KafkaOperator.producerUrl(diststrjson)
        # When driven by crontab the job runs once a day, so there is no need to sleep.
        if self.crontab == 1:
            os._exit(0)
        else:
            # Sleep n seconds (read from the config file).
            items = ConfigUtil.getItems('producerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
def simpleRun(self):
    '''
    Producer process entry point; reads the task sources in a loop.
    :return:
    '''
    Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
    while True:
        # Resource check.
        # KafkaOperator = kafkaUrlinformation()
        KafkaOperator = localKafkaUrlinformation()
        # if self.mogodbControl is None:
        #     self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"],
        #                                           Dbdata["default_collection"])
        # Parse the data sources.
        # if USE_SOURCEURL_TYPE is True:
        #     if USE_ASYNCTASK_TYPE is True:
        #         urlInformationList = ConfigUtil.readSourceListRealTime()
        #     else:
        #         urlInformationList = ConfigUtil.readSourceList()
        # else:
        #     urlInformationList = ConfigUtil.readTaskList()
        urlInformationList = ConfigUtil.readSourceListRealTime()
        # Crawl and parse child URLs.
        if urlInformationList is None:
            continue
        for urlInfor in urlInformationList:
            data = urlInfor.class2dict()
            # Download the home page.
            dowloadData = self.downLoadHtml(data)
            if dowloadData is None:
                continue
            # Extract the pagination URLs.
            pageData = self.getPageNumFromHome(dowloadData)
            if pageData is None:
                continue
            for pageIndex in pageData:
                # Download each page.
                dowloadPageData = self.downLoadHtml(pageIndex.class2dict())
                if dowloadPageData is None:
                    continue
                # Extract the child links.
                # self.URL_inf.dict2class(pageIndex)
                ccgpChildrenLink = self.getChildrenLink(dowloadPageData)
                if ccgpChildrenLink is None:
                    continue
                # Publish the child links to Kafka.
                for link in ccgpChildrenLink:
                    # Make sure the MongoDB connection exists.
                    if self.mogodbControl is None:
                        self.mogodbControl = Mongodb_Operator(
                            DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                            DbdataCCGPDFZB["db_name"],
                            DbdataCCGPDFZB["default_collection"])
                    # Deduplicate so duplicates are not re-sent to Kafka.
                    if link.title is None:
                        # Skip links without a title.
                        continue
                    uuid = self.get_md5(link.Urlname, link.title)
                    item = {"uuid": uuid}
                    value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
                    # TODO inserting into the database still has issues
                    if value is not None:
                        # Already stored; skip.
                        continue
                    # Downstream (Yu Hao) does not want parent links, so only forward child links.
                    if link.DeepNum >= 0:
                        producerData = json.dumps(link.class2dict())
                        Log.i("produce<<" + producerData)
                        KafkaOperator.producerUrl(producerData)
        # When driven by crontab the job runs once a day, so there is no need to sleep.
        if self.crontab == 1:
            os._exit(0)
        else:
            # Sleep n seconds (read from the config file).
            items = ConfigUtil.getItems('producerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
def getPageNumFromHome(self, dowloadData):
    """
    Extract the pagination URLs from the home page.
    """
    if dowloadData['soup'] is None:
        return []
    # Log.i(dowloadData['content'].decode('utf-8'))
    selector = etree.HTML(dowloadData['content'].decode('utf-8'))
    try:
        # Total result count divided by 20 results per page, plus a safety margin.
        page = (int(selector.xpath(
            '//div[@class="vT_z"]/div[1]/div/p[1]/span[2]/text()')[0]) // 20) + 3
    except Exception:
        return []
    if page is None:
        return []
    parentURL_infor = []
    # Use a random number to decide whether to walk the pages forward or in reverse.
    num = random.randint(3, 7)
    if (num % 2) == 0:
        pageRange = range(1, page)
    else:
        pageRange = range(page - 1, 0, -1)
    for i in pageRange:
        # TODO the substitution below is fragile: it only replaces a single character after "page_index=".
        # Example source URL:
        # 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
        x = 'page_index=' + str(i)
        tempUrl = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
        Log.i("parseUrl<<" + tempUrl)
        urlChildInfo = URLinformation(
            Urlname=tempUrl,
            title=dowloadData['title'],
            DeepNum=dowloadData['DeepNum'],
            domain=dowloadData['domain'],
            fatherUrl=dowloadData['fatherUrl'])
        parentURL_infor.append(urlChildInfo)
    return parentURL_infor
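# The TODO above notes that re.sub(r'page_index=(.)', ...) only matches one character, so it
# breaks once the page index has two or more digits (re.sub(r'page_index=\d+', ...) would be
# the minimal fix). A more robust sketch that rewrites the query string itself, as a
# standalone helper with a hypothetical name, could look like this:
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def set_page_index_sketch(url, page_index):
    parts = urlsplit(url)
    query = parse_qs(parts.query, keep_blank_values=True)
    query['page_index'] = [str(page_index)]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))

# set_page_index_sketch('http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0', 12)
# -> 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=12&bidSort=0'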
def getChildrenLink(self, pageIndex):
    """
    Extract child links from a result page.
    :return:
    """
    pattern = re.compile(r'htt(p|ps):\/\/(\w+\.)+\w+/(\w+/)*')
    Keyvalue = pattern.search(pageIndex['Urlname'])
    # Keyvalue e.g. <_sre.SRE_Match object; span=(0, 26), match='http://search.ccgp.gov.cn/'>
    if Keyvalue is not None:
        Keyvalue = Keyvalue.group()
    else:
        Keyvalue = urlparse(pageIndex['Urlname']).scheme + "://" + urlparse(pageIndex['Urlname']).netloc
    domain = Keyvalue
    URL_infor = []
    URL_infor2 = []
    Links = []
    link2 = ''
    title = ''
    currentTime = ''
    total_title = ''
    if USE_BXBLS is True:
        # Business branch: parse the CCGP bid-result list.
        if pageIndex['soup'] is None:
            return []
        urlInfoList = pageIndex['soup'].select(".vT-srch-result-list-bid")
        if not urlInfoList:
            return []
        ul_content = urlInfoList[0]
        for li in ul_content.select("li"):
            link = li.select("a")[0]
            # emProvince = li.select("span")[2].get_text()
            spanProvince = li.select("span")[0]
            emProvince = spanProvince.select("a")[0].get_text()
            currentTime = time.time()
            try:
                href2 = link['href']
                total_title = link['title']
            except KeyError:
                pageIndex['soup'].select("a").remove(link)
                continue
            # Normalise the title for the different kinds of hrefs (absolute, root-relative, dot-relative).
            if (href2.startswith("/") or href2.startswith("../../..")
                    or href2.startswith("..") or href2.startswith("./")
                    or ('http' in href2 and 'gov' in href2)):
                title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
            link2 = urljoin(pageIndex['Urlname'], href2)
            # The list view truncates long titles, so fall back to the full title attribute.
            if title.find("...") > -1:
                title = total_title
            title = title.strip('\r')
            myLinkUrl = URLinformation(Urlname=link2,
                                       title=title,
                                       DeepNum=pageIndex['DeepNum'] - 1,
                                       domain=pageIndex['domain'],
                                       fatherUrl=pageIndex['Urlname'],
                                       province=emProvince,
                                       LastTime=currentTime)
            URL_infor.append(myLinkUrl)
    else:
        for link in pageIndex['soup'].select("a"):
            # Example anchor:
            # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm" target="_blank">...</a>
            try:
                # href may be an absolute URL, javascript:void(0), or "#".
                href2 = link['href']
            except KeyError:
                pageIndex['soup'].select("a").remove(link)
            else:
                if (href2.startswith("/") or href2.startswith("../../..")
                        or href2.startswith("..") or href2.startswith("./")
                        or ('http' in href2 and 'gov' in href2)):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                link2 = urljoin(pageIndex['Urlname'], href2)
                myLinkUrl = URLinformation(Urlname=link2,
                                           title=title,
                                           DeepNum=pageIndex['DeepNum'] - 1,
                                           domain=pageIndex['domain'],
                                           fatherUrl=pageIndex['Urlname'])
                URL_infor.append(myLinkUrl)
    if USE_BXBLS is True:
        Links = list(set(URL_infor))
    else:
        # TODO pageIndex['soup'] can be None here, which raises AttributeError on .select
        for http in pageIndex['soup'].select('option'):
            try:
                http2 = http['value']
            except KeyError:
                pageIndex['soup'].select("option").remove(http)
            else:
                if "gov" in http2 and 'http' in http2:
                    myLinkUrl2 = URLinformation(Urlname=http2,
                                                title=http.text,
                                                DeepNum=pageIndex['DeepNum'] - 1,
                                                domain=pageIndex['domain'],
                                                fatherUrl=pageIndex['Urlname'])
                    URL_infor2.append(myLinkUrl2)
        Links = list(set(URL_infor + URL_infor2))
    Log.i("This url has " + str(len(Links)) + " children urls " + str(pageIndex['DeepNum']))
    return Links
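# urljoin() is what lets the branches above treat absolute and relative hrefs uniformly;
# a few illustrative cases against a CCGP-style base URL (values shown as comments):
from urllib.parse import urljoin

base = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1'
# Absolute hrefs pass through unchanged:
# urljoin(base, 'http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm')
#   -> 'http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm'
# Root-relative and dot-relative hrefs are resolved against the base:
# urljoin(base, '/cggg/zygg/index.htm')  -> 'http://search.ccgp.gov.cn/cggg/zygg/index.htm'
# urljoin(base, './detail.htm')          -> 'http://search.ccgp.gov.cn/detail.htm'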
def parse_default(self):
    Log.i('run default parser ')
    pass
def downLoadHtml(self):
    """
    Download the page and prepare it for child-link extraction.
    :param urlInfor:
    """
    if self.ipProxy is None:
        self.ipProxy = self.getIpPoolMethod()
    if self.heads is None:
        self.heads = self.headersEngine.getHeaders()
    # Example of the entity being filled in:
    # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
    #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
    #  'content': None,
    #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
    #  'SleepTime': 0.0, 'FilePath': None}
    html = None     # full page content
    ctifety = 0     # flag: child links can be parsed
    Flag = 1        # flag: still downloading
    count = 0       # empty-page counter
    while Flag:
        try:
            if count > 1:
                self.ipProxy = self.getIpPoolMethod()
            protocol = 'https' if 'https' in self.ipProxy else 'http'
            proxiesmmm = {protocol: self.ipProxy}
            req = requests.get(self.URL_inf.Urlname,
                               headers=self.heads,
                               allow_redirects=False,
                               proxies=proxiesmmm,
                               timeout=3)
            # Detect the anti-crawling verification page and switch proxy if it shows up.
            soup_validate = BeautifulSoup(req.text, 'lxml')
            if soup_validate.find(name='title').string == '安全验证':
                self.ipProxy = self.getIpPoolMethod()
                continue
            if req.status_code != 200:
                self.ipProxy = self.getIpPoolMethod()
                continue
            reqheaders = req.headers
            if "application" in reqheaders["Content-Type"]:
                self.__downlowdFile(self.URL_inf.Urlname, req)
                self.URL_inf.Download = 1
                Flag = 0  # file saved, leave the loop
            elif "text" in reqheaders["Content-Type"]:
                html = req.content
                self.URL_inf.Download = 0
                ctifety = 1
                Flag = 0  # page fetched, leave the loop
            else:
                continue
        except requests.exceptions.ConnectTimeout as e:
            Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
            self.heads = self.headersEngine.getHeaders()
            count += 1
            if html is None:
                Flag = 1
        except (ConnectionError, Timeout) as e:
            Flag = 1
            count += 1
            Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
            self.heads = self.headersEngine.getHeaders()
            # Close idle connections to avoid "Max retries exceeded with url" errors.
            requests.adapters.DEFAULT_RETRIES = 5
            s = requests.session()
            s.keep_alive = False
            count += 1
            if html is None:
                Flag = 1
        except Exception as e:
            Flag = 1
            count += 1
            # TODO handle javascript:void(0) style links gracefully
            #      (see https://www.zhihu.com/question/20626694?from=profile_question_card)
            # TODO handle "Invalid return character or leading space in header: Accept-Language"
            # TODO handle "HTTPConnectionPool max retries / Failed to establish a new connection"
            Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
            self.heads = self.headersEngine.getHeaders()
            # Handle "Max retries exceeded with url" errors.
            s = requests.session()
            s.keep_alive = False
            count += 1
            if html is None:
                Flag = 1
    if ctifety:
        self.URL_inf.content = html
        soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
    else:
        soup = None
    self.URL_inf.soup = soup
    if self.URL_inf.content is not None:
        Log.i(self.URL_inf.content.decode('utf-8'))
    return self.URL_inf