def __scheduleProjectCorpInfo__(self, cycle=SPIDER_CYCLE):
    """Crawl project participating-company info for IDs queued in Redis.

    Fix: added a docstring and the ``cycle`` keyword parameter so the
    signature matches every sibling ``__schedule*__`` method (the value
    is unused here, as in the siblings).

    :param cycle: scheduling interval; kept for interface consistency
    :return: None
    """
    try:
        conn = RedisClient()
        spider = ProjectCorpInfo()
        print('开始获取项目')
        key = 'TempProjectCorpInfoID'
        # De-duplicate the queued IDs before handing them to the spider.
        list_id = list(set(conn.all(rediskey=key)))
        spider.run(list_id, main_url=[PROJECTCORPINFO], key=key)
    except Exception as e:
        print("Error Spider project list", e)
def __scheduleProjectFinishInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl project-completion records queued in Redis.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        print('开始获取公司项目竣工信息')
        finish_spider = ProjectFinishListSpider()
        redis_key = 'TempProFinishListID'
        # De-duplicate the queued IDs before crawling.
        pending_ids = list(set(redis_conn.all(rediskey=redis_key)))
        finish_spider.run(pending_ids, main_url=[FINISHMANAGE], key=redis_key)
    except Exception as err:
        print("Error Spider project list", err)
def __scheduleBuildLicenceInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl construction-permit records queued in Redis.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        print('开始获取公司项目施工许可信息')
        licence_spider = BuildLicenceListSpider()
        redis_key = 'TempBuildLicenceListID'
        # De-duplicate the queued IDs before crawling.
        pending_ids = list(set(redis_conn.all(rediskey=redis_key)))
        licence_spider.run(pending_ids, main_url=[LICENCEMANAGE], key=redis_key)
    except Exception as err:
        print("Error Spider project list", err)
def __scheduleContractRecordInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl contract-registration records queued in Redis.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        print('开始获取公司项目合同登记信息')
        contract_spider = ContractRecordListSpider()
        redis_key = 'TempContractListID'
        # De-duplicate the queued IDs before crawling.
        pending_ids = list(set(redis_conn.all(rediskey=redis_key)))
        contract_spider.run(pending_ids, main_url=[CONTRACTRECORD], key=redis_key)
    except Exception as err:
        print("Error Spider project list", err)
def __scheduleTenderInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl project tender records queued in Redis.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        print('开始获取公司项目招标信息')
        tender_spider = TenderListSpider()
        redis_key = 'TempTenderListID'
        # De-duplicate the queued IDs before crawling.
        pending_ids = list(set(redis_conn.all(rediskey=redis_key)))
        tender_spider.run(pending_ids, main_url=[TENDERAPI], key=redis_key)
    except Exception as err:
        print("Error Spider project list", err)
def __scheduleProjectInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl project details for IDs not yet fetched.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        project_spider = ProjectInfoSpider()
        print('开始获取公司项目信息')
        print('当前以获取项目ID量为:', redis_conn.count(rediskey='ProjectInfoID'))
        # Crawl only the queued IDs that have not been fetched yet.
        queued = set(redis_conn.all(rediskey='TempProjectListID'))
        fetched = set(redis_conn.all(rediskey='ProjectInfoID'))
        project_spider.run(list(queued - fetched))
    except Exception as err:
        print("Error Spider project list", err)
def __scheduleCompInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl company details for IDs not yet fetched.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        comp_spider = CompanyInfoSpider()
        print('开始获取公司信息')
        print('当前以获取公司信息量为:', redis_conn.count(rediskey='CompInfoID'))
        # Crawl only the queued IDs that have not been fetched yet.
        queued = set(redis_conn.all(rediskey='TempCompInfoID'))
        fetched = set(redis_conn.all(rediskey='CompInfoID'))
        comp_spider.run(list(queued - fetched))
    except Exception as err:
        print("Error Spider comp info", err)
def __scheduleBuildLicencePerson__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl construction-permit personnel records.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :return: None
    """
    try:
        redis_conn = RedisClient()
        person_spider = BuildLicencePersonSpider()
        print('开始获取公司项目施工许可人员信息')
        print('当前以获取项目施工许可人员ID量为:', redis_conn.count(rediskey='BuildLicencePersonID'))
        # Only permits whose personnel have not been crawled yet.
        permit_ids = set(redis_conn.all(rediskey='BuildLicenceInfoID'))
        fetched_ids = set(redis_conn.all(rediskey='BuildLicencePersonID'))
        person_spider.run(list(permit_ids - fetched_ids))
    except Exception as err:
        print("Error Spider project list", err)
def __scheduleWithinProjectList__(self, cycle=SPIDER_CYCLE, main_url=None, TempList=None):
    """Crawl one type of per-project sub-list.

    :param cycle: scheduling interval; accepted for interface
        consistency with the sibling schedulers but not used here
    :param main_url: base URL(s) forwarded to the spider
    :param TempList: list-type name; the Redis queue key is
        ``'Temp' + TempList``
    :return: None
    """
    try:
        redis_conn = RedisClient()
        # NOTE(review): the default TempList=None would raise here
        # (str + None) and be swallowed by the broad except below —
        # callers are expected to always pass a value; confirm.
        redis_key = 'Temp' + TempList
        pending_ids = list(set(redis_conn.all(rediskey=redis_key)))
        SaveWithinProjectSpider().run(pending_ids, main_url, TempList)
    except Exception as err:
        print("Error Spider project list", err)
def __schedulePersonInfo__(self, cycle=SPIDER_CYCLE):
    """Periodically crawl company staff records, then sleep ``cycle`` s.

    :param cycle: number of seconds to sleep after a crawl pass
    :return: None
    """
    try:
        redis_conn = RedisClient()
        staff_spider = PersonSpider()
        print('开始获取公司人员信息')
        print('当前以获取公司人员信息量为:', redis_conn.count(rediskey='PersonInfoID'))
        # Crawl staff only for companies whose staff are not fetched yet.
        comp_ids = set(redis_conn.all(rediskey='CompInfoID'))
        fetched_ids = set(redis_conn.all(rediskey='PersonInfoID'))
        staff_spider.run(list(comp_ids - fetched_ids))
        time.sleep(cycle)
    except Exception as err:
        print("Error Spider Staff info", err)
def __init__(self):
    # Clients for ID bookkeeping (Redis) and row persistence (MySQL).
    self._redis = RedisClient()
    self._mysql = MySQLClient()
    # Request headers; replaced when a fresh accessToken is obtained.
    self._HEADERS = HEADERS
    # Current proxy endpoint, resolved lazily before the first request.
    self.ip = None
    self.port = None
class SpiderMain(object):
    """Asynchronous crawler that fetches pages through a rotating HTTP
    proxy, refreshes the remote accessToken whenever the server answers
    with its encrypted "invalid token" payload, and persists results via
    Redis (ID bookkeeping) and MySQL (row storage).

    Fixes applied in review:
      * ``get_one_page`` dropped the result of its error-path retry
        (missing ``return``), so every errored request yielded None.
      * ``main`` fell through after the re-auth retry and also saved the
        failed batch.
      * ``__saveOneID__`` was defined twice, verbatim; duplicate removed.
      * ``__getProxy__``'s URL contained mojibake ``®ions=`` — the
        ``&reg`` prefix of ``&regions=`` had been decoded as the HTML
        entity for '®'.
    """

    # Encrypted body the server returns when the accessToken is invalid.
    _BAD_TOKEN_MARKER = (
        '4bd02be856577e3e61e83b86f51afca55280b5ee9ca16beb9b2a65406045c9497c089d5e8ff97c63000f62b011a6'
        '4f4019b64d9a050272bd5914634d030aab69'
    )

    def __init__(self):
        self._redis = RedisClient()
        self._mysql = MySQLClient()
        self._HEADERS = HEADERS
        # Current proxy endpoint; fetched lazily on the first request.
        self.ip = None
        self.port = None

    def _refresh_proxy(self):
        """Fetch a fresh dynamic proxy and remember it on the instance."""
        self.ip, self.port = self.__getProxy__()

    async def get_one_page(self, url):
        """GET *url* through the current proxy.

        Retries with a fresh proxy on any exception, and re-requests on
        statuses other than 200/408.

        :return: the response body text (None only if recursion is
            interrupted by an unrecoverable condition)
        """
        try:
            if self.ip is None:
                self._refresh_proxy()
            real_proxy = 'http://' + str(self.ip) + ":" + str(self.port)
            # NOTE(review): this Semaphore is created per call, so it does
            # not actually bound concurrency across gathered tasks — to do
            # that it would have to be created once in main(); confirm intent.
            async with asyncio.Semaphore(MAX_ID):
                async with aiohttp.ClientSession(
                        connector=aiohttp.TCPConnector(ssl=False)) as session:
                    async with session.get(url, proxy=real_proxy,
                                           headers=self._HEADERS,
                                           timeout=15) as r:
                        if r.status == 200 or r.status == 408:
                            return await r.text()
                        else:
                            return await self.get_one_page(url)
        except Exception as e:
            print('请求异常: ' + str(e))
            self._refresh_proxy()
            # Fix: propagate the retry's result (original had no return).
            return await self.get_one_page(url)

    async def main(self, urls, comp_id=None):
        """Fetch *urls* concurrently and persist the decoded results.

        When the server answers with the invalid-token payload (or the
        first result is False), a new proxy and accessToken are obtained
        and the whole batch is retried; the failed batch is NOT saved.

        :param urls: iterable of page URLs to fetch
        :param comp_id: optional company ID forwarded to __saveJsonData__
        """
        try:
            tasks = [self.get_one_page(url) for url in urls]
            results = await asyncio.gather(*tasks)
            if len(results) > 0:
                if self._BAD_TOKEN_MARKER in results or results[0] is False:
                    # Token rejected: rotate the proxy, mint a new token,
                    # then redo the entire batch.
                    self._refresh_proxy()
                    print("动态 ip 为" + str(self.ip) + ", 端口:" + str(self.port))
                    my_proxy = 'http://' + str(self.ip) + ":" + str(self.port)
                    access_token = getToken(my_proxy)
                    while access_token is None:
                        access_token = getToken()
                    self._HEADERS = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'accessToken': access_token
                    }
                    print(access_token)
                    # Fix: return here — the original fell through and also
                    # saved the failed batch after the retry completed.
                    return await self.main(urls, comp_id)
                # Persist the successful batch.
                await self.__saveJsonData__(data=results, comp_id=comp_id)
        except Exception as e:
            print(e)

    def __getMaxPage__(self, url):
        """Synchronously fetch *url* (page-count discovery), retrying with
        fresh tokens/proxies until a code-200 JSON payload is decoded.

        :return: the decoded payload — a list whose first element carries
            a ``code`` field
        """
        try:
            if self.ip is None:
                self._refresh_proxy()
            proxyMeta = "http://%(host)s:%(port)s" % {
                "host": self.ip,
                "port": self.port,
            }
            proxies = {
                "http": proxyMeta,
            }
            response = requests.get(url, proxies=proxies,
                                    headers=self._HEADERS,
                                    verify=False, timeout=10)
            if self._BAD_TOKEN_MARKER in response.text:
                access_token = getToken()
                while access_token is None:
                    access_token = getToken()
                self._HEADERS = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'accessToken': access_token
                }
                return self.__getMaxPage__(url)
            res = decrypts(response.text)
            # Re-shape the decrypted text into parseable JSON; this exact
            # string surgery matches the upstream response format.
            res = str(res).replace(
                "'", "").split('success')[0] + 'success":true}' + "]"
            data_json = json.loads(res)
            if data_json[0]['code'] == 401:
                time.sleep(60)  # rate-limited: back off before retrying
                return self.__getMaxPage__(url)
            elif data_json[0]['code'] == 200:
                return data_json
            else:
                return self.__getMaxPage__(url)
        except Exception as e:
            print(e)
            self._refresh_proxy()
            return self.__getMaxPage__(url)

    def __getID__(self, rediskey=None):
        """Fetch a batch of IDs stored under *rediskey*."""
        return self._redis.batch(rediskey=rediskey)

    def __findOneID__(self, idx=None, rediskey=None):
        """Return whether *idx* is already recorded under *rediskey*."""
        return self._redis.exists(idx=idx, rediskey=rediskey)

    def __saveOneID__(self, idx=None, rediskey=None, score=None):
        """Record *idx* under *rediskey*, with an optional sort score.

        Fix: the original defined this method twice verbatim; the
        duplicate definition has been removed.
        """
        if score is not None:
            self._redis.add(idx=idx, rediskey=rediskey, score=score)
        else:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __deleteID__(self, idx=None, rediskey=None):
        """Delete *idx* from the set stored under *rediskey*."""
        return self._redis.deletes(idx=idx, rediskey=rediskey)

    def __saveListID__(self, list_id, rediskey=None):
        """Record every ID in *list_id* under *rediskey*."""
        for idx in list_id:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __saveOneData__(self, table_name, data):
        """Insert one row of *data* into MySQL table *table_name*."""
        print(data)
        return self._mysql.__insertData__(table_name=table_name, data=data)

    def __closeMysql__(self):
        """Close the MySQL connection, best-effort."""
        try:
            self._mysql.__closeDB__()
        except Exception as e:
            print('Close Mysql failed!', e)

    def __asyncSpider__(self, list_id=None, comp_id=None):
        """Run one async batch to completion on the current event loop."""
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.main(list_id, comp_id))

    def __getProxy__(self):
        """Fetch one dynamic proxy from the tiqu provider.

        :return: (ip, port) tuple
        """
        # Fix: '&regions=' was mojibake-mangled to '®ions='.
        url = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=42599&port=1&ts=1&lb=1&pb=4&regions='
        response = requests.get(url=url)
        json_str = json.loads(response.text)
        ip = json_str["data"][0]["ip"]
        port = json_str["data"][0]["port"]
        return (ip, port)

    def __getYunProxy__(self):
        """Alternate proxy provider (ip3366); same (ip, port) contract."""
        url = 'http://gec.ip3366.net/api/?key=20191204153949621&getnum=1&anonymoustype=3&filter=1&area=1&order=2&formats=2'
        response = requests.get(url=url)
        json_str = json.loads(response.text)
        ip = json_str[0]["Ip"]
        port = json_str[0]["Port"]
        return (ip, port)

    def run(self, data_list):
        """Crawl each ID batch in *data_list*, then close MySQL."""
        for data in data_list:
            self.__asyncSpider__(list_id=data)
        self.__closeMysql__()
def __init__(self):
    # Clients for ID bookkeeping (Redis) and row persistence (MySQL).
    self._redis = RedisClient()
    self._mysql = MySQLClient()
    # Request headers; replaced when a fresh accessToken is obtained.
    self._HEADERS = HEADERS
class SpiderMain(object):
    """Proxy-less crawler variant: fetches pages with aiohttp, refreshes
    the accessToken when the server answers with its encrypted "invalid
    token" payload, and stores results via Redis/MySQL helpers.

    Fix applied in review: ``main`` fell through after the token-refresh
    retry and saved the failed (marker) batch as well.
    """

    # Encrypted body the server returns when the accessToken is invalid.
    _BAD_TOKEN_MARKER = (
        '4bd02be856577e3e61e83b86f51afca55280b5ee9ca16beb9b2a65406045c9497c089d5e8ff97c63000f62b011a6'
        '4f4019b64d9a050272bd5914634d030aab69'
    )

    def __init__(self):
        self._redis = RedisClient()
        self._mysql = MySQLClient()
        self._HEADERS = HEADERS

    async def get_one_page(self, url):
        """GET *url*; return the body text, or {} on any request error."""
        try:
            async with ClientSession() as session:
                async with session.get(url, headers=self._HEADERS) as r:
                    return await r.text()
        except Exception as e:
            print('请求异常: ' + str(e))
            return {}

    async def main(self, urls):
        """Fetch *urls* concurrently and persist the results.

        On an invalid-token response the accessToken is refreshed and the
        whole batch retried; the failed batch is NOT saved.
        """
        tasks = [self.get_one_page(url) for url in urls]
        results = await asyncio.gather(*tasks)
        if self._BAD_TOKEN_MARKER in results:
            # NOTE(review): unlike the proxy variant, getToken() is not
            # retried here when it returns None — confirm that is intended.
            accessToken = getToken()
            self._HEADERS = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'accessToken': accessToken
            }
            print(accessToken)
            # Fix: return here — the original fell through after the retry
            # and saved the failed batch a second time.
            return await self.main(urls)
        # Persist the successful batch.
        self.__saveJsonData__(data=results)

    def __getMaxPage__(self):
        # Placeholder: page-count discovery is not needed in this variant.
        pass

    def __getID__(self, rediskey=None):
        """Fetch a batch of IDs stored under *rediskey*."""
        return self._redis.batch(rediskey=rediskey)

    def __findOneID__(self, idx=None, rediskey=None):
        """Return whether *idx* is already recorded under *rediskey*."""
        return self._redis.exists(idx=idx, rediskey=rediskey)

    def __saveOneID__(self, idx=None, rediskey=None, score=None):
        """Record *idx* under *rediskey*, with an optional sort score."""
        if score is not None:
            self._redis.add(idx=idx, rediskey=rediskey, score=score)
        else:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __saveone__(self, idx=None, rediskey=None):
        """Record a single *idx* under *rediskey* via add_one."""
        self._redis.add_one(idx=idx, rediskey=rediskey)

    def __deleteID__(self, idx=None, rediskey=None):
        """Delete *idx* from the set stored under *rediskey*."""
        return self._redis.deletes(idx=idx, rediskey=rediskey)

    def __saveListID__(self, list_id, rediskey=None):
        """Record every ID in *list_id* under *rediskey*."""
        for idx in list_id:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __saveOneData__(self, table_name, data):
        """Insert one row of *data* into MySQL table *table_name*."""
        print(data)
        return self._mysql.__insertData__(table_name=table_name, data=data)

    def __closeMysql__(self):
        """Close the MySQL connection, best-effort."""
        try:
            self._mysql.__closeDB__()
        except Exception as e:
            print('Close Mysql failed!', e)

    def run(self, data_list):
        """Split *data_list* into chunks of 5 and crawl each chunk."""
        datas = [data_list[x:x + 5] for x in range(0, len(data_list), 5)]
        for data in datas:
            # NOTE(review): __spiderInfo__ is not defined in this class —
            # presumably supplied by a subclass; confirm.
            self.__spiderInfo__(data=data)
def __scheduleCompListInfo__(self):
    """Drive a full crawl for every un-flagged company in MySQL.

    For each company whose name is not yet recorded in Redis: wipe all
    per-company Redis crawl state, run the company-list spider and every
    per-project sub-spider (skipped inside the 05:00-06:00 window), then
    mark the company row as done (skipped inside the 06:00-07:00 window).

    Fix: 23 copy-pasted ``delete_key`` calls replaced by a loop over a
    key tuple; commented-out code removed.

    :return: None
    """
    # All Redis keys holding per-company crawl state.
    state_keys = (
        'CompName', 'TempCompInfoID', 'CompInfoID', 'QualificationInfoID',
        'ProjectID', 'TempProjectListID', 'ProjectInfoID', 'TenderInfoID',
        'TenderListID', 'ContractListID', 'ContractInfoID',
        'BuildLicenceInfoID', 'BuildLicenceListID', 'BuildLicencePersonID',
        'ProFinishListID', 'ProjectFinishInfoID', 'ProjectCorpInfoID',
        'TempTenderListID', 'TempContractListID', 'TempProFinishListID',
        'TempProCensorListID', 'TempBuildLicenceListID',
        'TempProjectCorpInfoID',
    )
    try:
        conn = RedisClient()
        mysql = MySQLClient()
        sql = 'select * from companyName where flag is null order by id limit %d, %d' % (NUM, PRE)
        list_name = mysql.getAll(sql)
        for list_id in list_name:
            # list_id is a DB row tuple; index 1 is the company name.
            if conn.exists(idx=list_id[1], rediskey='CompName'):
                print(list_id[1], ' comp info is spiders')
            else:
                # Clear stale per-company state from the previous run.
                for state_key in state_keys:
                    conn.delete_key(rediskey=state_key)
                nowTime_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
                five_time = datetime.datetime.now().strftime('%Y-%m-%d') + " 05:00:00"  # 05:00 today
                six_time = datetime.datetime.now().strftime('%Y-%m-%d') + " 06:00:00"   # 06:00 today
                # Lexicographic comparison is valid because both sides use
                # the fixed '%Y-%m-%d %H:%M:%S' format.
                if nowTime_str < five_time or nowTime_str > six_time:
                    spider = CompanyListSpider()
                    spider.runs(list_id[1])
                    self.__scheduleProjectList__()
                    self.__scheduleTenderInfo__()
                    self.__scheduleContractRecordInfo__()
                    self.__scheduleProjectCorpInfo__()
                    self.__scheduleBuildLicenceInfo__()
                    self.__scheduleProjectFinishInfo__()
                    # NOTE(review): the company name is interpolated
                    # straight into SQL — a name containing '"' would
                    # break or inject; prefer a parameterized query if
                    # __updateData__ supports one.
                    update_sql = 'update companyName set flag = 1 where companyName = "%s" ' % list_id[1]
                    end_time = datetime.datetime.now().strftime('%Y-%m-%d') + " 07:00:00"  # 07:00 today
                    if nowTime_str > end_time or nowTime_str < six_time:
                        mysql.__updateData__(update_sql)
    except Exception as e:
        print(e)