class GovBuy(object):
    """Crawler for the Guangxi Public Resources Trading Information Network
    (http://www.gxzbtb.cn/).

    ``load_get()`` replays a captured ASP.NET pager postback (CSRF token +
    ``__VIEWSTATE``) to fetch one listing page and queues every detail URL it
    finds, deduplicated through a Redis set and buffered in a Redis list.
    ``init()`` runs worker loops that drain the list with small gevent batches,
    parsing each announcement in ``load_get_html()`` and storing it in MongoDB.
    """

    def __init__(self):
        name = 'guangxi_gxzbtb_cn'
        self.coll = StorageSetting(name)  # MongoDB storage helper
        self.collection = self.coll.find_collection
        # Session + CSRF cookies captured from a browser; they must match the
        # __CSRFTOKEN replayed in load_get().  NOTE(review): these expire
        # server-side and will need periodic refreshing — confirm validity.
        self.cookies = {
            'ASP.NET_SessionId': 'trbofu0uet0aywbdhr35s0x4',
            '__CSRFCOOKIE': '6f7e275f-5762-4569-8ea2-ae98d3b0379d',
        }
        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gxzbtb.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gxzbtb.cn/gxzbw/jyxx/001010/001010001/MoreInfo.aspx?CategoryNum=001010001',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Redis-backed dedup set (dbset) and work queue (dblist).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangxi_gxzbtb_cn_list1',
                             dbset='guangxi_gxzbtb_cn_set1')

    def is_running(self):
        """Return False once the work list is drained while the dedup set is
        non-empty (i.e. the crawl produced work and it is all consumed);
        True otherwise."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo _id)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement; the trailing is_running() read is
        kept from the original flow (its return value is intentionally unused)."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort extraction of a '省-市' area string from *strs* using the
        ``transform`` helper; falls back to *pro* when no area is found and
        implicitly returns None when ``transform`` itself fails.
        NOTE(review): r'/r|/n' looks like a typo for r'\r|\n', but \s already
        covers those characters, so behavior is unaffected."""
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '',
                                     re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            pass  # transform() failed entirely: fall through, return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url):
        """Fetch one announcement detail page, parse title/date/body and store
        the record in MongoDB.  Silently returns on fetch failure or when the
        expected content table is missing."""
        if url is None:
            return
        try:
            # FIX: dropped leftover debug line `selector_div = etree.HTML(str(div))`
            # which referenced an undefined name and made every call fail.
            response = requests.get(url=url,
                                    headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        title = selector.xpath('//td[@id="tdTitle"]/font//text()')
        if title:
            title = re.sub(r'\r|\n|\s', '', ''.join(title))
            # FIX: the original pattern used a character class
            # ["招标","中标",...] which matched arbitrary single characters
            # (including quotes and commas); an alternation matches the
            # intended announcement-type words.
            matched = re.search(r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
            status = matched.group() if matched else '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = selector.xpath('//td[@id="tdTitle"]/font[2]//text()')
        if publish_date:
            # Dates appear as YYYYMMDD or YYYY/M/D; normalise '/' to '-'.
            # FIX: guard against a non-matching search instead of calling
            # .group() on None (AttributeError).
            date_match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',
                                   ''.join(publish_date))
            publish_date = re.sub(r'\/', '-', date_match.group()) if date_match else None
        else:
            publish_date = None
        area_name = '广西'
        source = 'http://www.gxzbtb.cn/'
        table_ele = selector.xpath('//table[@id="tblInfo"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '广西壮族自治区公共资源交易中心'
        result_dict['en_name'] = 'Guangxi Zhuang National Public Resources'
        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        """POST one listing page (an ASP.NET pager postback for category
        *types*, page *page*) and queue every detail URL it contains."""
        try:
            params = (('CategoryNum', types),)
            data = {
                '__CSRFTOKEN': '/ wEFJDZmN2UyNzVmLTU3NjItNDU2OS04ZWEyLWFlOThkM2IwMzc5ZA ==',
                # Captured __VIEWSTATE blob replayed verbatim (required by the
                # ASP.NET postback); split via implicit string concatenation
                # purely for line length — the value is unchanged.
                '__VIEWSTATE': (
                    'z6UhCTu3jqnsz47aHWA7gSWW/wG9YleyN9akSy8SDfrTIhkXv/8D27JYdKJG/ZWKPqb0smc7bS8/xKHcu0vIwbRrxU6DQIlkQJ3m97wtYMFtK8KpjZwIdMqSgnw1q3DjBu9vEraO4xfqzJacAXSCukutXa8BPCyLevI3U1naYHFUSSNqQhNo9cICs8Kqr8n8HNpvSRjqJB8CTAWoGlc8x6IeC/j50VdUETRudT9/w6Xc0To0rsY/yH+VbMRbAzIFlzNvQP/dmUPEdjMSRkvyULU6ZIgal19QWLJXJSGioQKJ3StzC9BqsgyaCobteQoKLs8/h20aMOCs2YO/oSpUVr0AaapTqrGIMhrM/IaPn8N00monNce5uU1fWffkBK6zL4mJijgPTmuyCrA3/AUY5La8VvP1v2NUScoKAdjRaPypTDlh1+ZUt3x7ZdwcmWd7bwgAK42uneSLZWSC7Er0k9VcuPZTR4E/I8fbEzJWf4Bf9pI2hs5actOcnD4ETfu5m/dgfq1mgz4uTrYIRcqE1xOmE3WDJNircnYO4xVTI49MDYfgDcGtjWRiBZHd788/5abVt7h9sXkzXPHboi7zVv4haS8yZsIEeQG3F8MFVtM7H9+/Kbss3uPC5I5V/dDw54S2zejVmbAx9dU68wJfL1+c28EbvOUOWgOE6dCuFmTS3nSiBLMjwUeRtKwhvP1RA5MsKD4WI8JtqR545DULfQz0XJCh1PlO+Nd5L661UuspC/PvgWRoNQEoaVpLJK1S9UuPAdNnAqXdMuJdZZPu60+Jdig7zOBSEGbmwNvmXB0vphagqkqxf1nruFS0OGP/D7OJcbcObzotOwp1GpMmBdqg6hiDh2nccyFZ+E4DUv5NssGK4Zj7dY2jhMBv8bvkIwaY+uLYMLJwJ0DqhNyP2JTKv/FmENt0pjfytL0EU2HRmLTcPgJWgdQ2IZ7GZCYhkSzBfkkJOmVx7t+UZervSy+hZBsVsaz0DpKZ9JMVXfOYVzQNZt+VmcmIn9feEEJH6F8t4gYlC1pvrfcFcBVI8ndumFTtsYjnHhD7gMG8q64rCOoD0DAB0si2OdOndIUczT0RlhLkpqa9RA1nQ9kj75RJGe/dm4NGfCqqUHcRJTNbixZUPdA2pJNzYhRAMLQBqGmFANV+tvqB2yDiJg86H6ouBO7v2+SXxkp70ZBxv8CiAHw2kKEWoxfqmnMi552GiJRSrpOhcw3ylrYA3dINyJjtDJ9ZNYxLGWD5Vzu170wePz/foMZ6o2+8zWgEBc3PDx2l3UTG1TUwt8dbQbokscyKFWtCoo/qs9kkZS1KYBZ2NKe8K3EswLU3d5pHJsUtFhe2QtHhMolkwM3LTzBIIBl1QyPS6HDnCasCFHNbGX2/k/TMPLEBrOpdsRo1YhhhNMYz8pwQbwTxo4WRsmmQBvPUkTSlk04Iasp9Tm8/2WO/NIFs0Y/h0BvdanXJHwP8blNEMZCd5qmP02LdYGygy6hs2uU79m+VM6wtHIbYxkC2AKuDkErEqaOuQCNNiQfyP6e0oaZwNcWQOOaQDWsz9/F12QCDwx3X2ihIaG2v1YCQHKC/lfnBQ8o9Q9LvrLuZ4yjVbaO3B1eM0Q46zuTbT9KbhLwkFm8LH/2JM/OOvvUDNN7A4z42REh3kZWC0eXOyCDD1HvNdOFxluC6GRtEdv6/31i/PKLPr7te5VCIGCFjF9oCxquF9+2ecYtlmqcvbEvCKnPx1JDO1BloI0i8TqwjDmqyaORTTYzrtJwnXjKiX+8TKDC9yyOMXA1qbTt/KZPTpHI1R0P9qQ7Pk8AwKeL1y5g68OMHUqOsSyJuo6SNety/xymBke5m1FG2zE9M7OOqSGtV6NCKXNoSgi3laUmbAMZ8x+GOQnXrfpMGvtPUPIUY9'
                    'zEvUiEDeKKQSnqlxf5LwEWyWlEuzSjO4+8nIGVC9nUb/YEIM5o2wiC1lMXl2d2tDQ/Mau5M3B6qmSLx5QP3nfjDKsoFqN0tQxlo/TBCKLXAUOHM8zTrEBY/xkb6tvnijW+leCYPSKURnheByCjFWSPnlz6C8tiktma/JzVph5blcc3thpmHiGp87enSqKQkjIf8RJkVeM+ENhg0gAndrokhhPBiS+MTNCuX2zXimlH2dpTY7JKu4uSyltVswpG2mLWFGegTeKLsBjVks8je/eeJvAaevVRh9mNOD4Cj8jh6/6taR2ee0/EjYlDIkrCNstLNBcQ35u7NQOesHpN9j7Zxf1iSsz1ChY/fS3w+3AVg7hnZA6yr1pUa54NWQEakrAgjpNUTzdTkSfyLkGmdqSXZEady89XXYBKDfF6rkDa8hb19ujrWQZn9m9K22OeAw1k3w8wl29I2LMno86bezhCDhZRVa2RrsbsYAtJ+TMnEdWUuINhSrEbe9zRRga2N4BJv+eopnSClJYNNgkMiNVEOdWnfDVa9Wb9iqVRYfjBKfZRv8g4/tlMr6ygKYPBRprLiv1VQT9M+5hkhLWgtGeyOzTGxfiZG6QHnqSL2g/A+nu5Ij3fGoVDEPPj3Adcqk6AUrcY+XaJxR9wVVz927mFfFq5kxjo12Sw2ak1pS7faIy7o9Fk7Y9XKh1qu35ltABHEqiVIeb/dymZ7oLV+AClQeLbbmciJ7NKrdzTwRxanqOirpiPl5MnJtQxROEbt6lYeRG1RzEUsKMlp/L5v2aBRnkVWC4odd6FafVJw1NFDAhtVrI3uGta566tdsuT+FYaXOtELa/hUjBES+jWAJ0+qrDVff6ilka90N5wpQ55cjCwAs1VtaLa6b/zuin4h6+wfwtJnEGBfXND+1AQSbrveJHojhedFjPAYsSG988yhO0A1+TdQWGoJQmlEINiELipfNz/CUCbHENz431cxEjZV6No6qEXLUVXcbXp0BRB8sOZWtmbJ5LaLzS+unRSRN9RMk/80ct6AuINtSE2MCwrBpkrB3DhkebVRwWxxODsfGOj20j5pVpeI8jF75k/9igiTP/+3+N20FTsoJ/fVXevJ2YTUHIrJZc2j3bNDZ6LuHcJbEjS5DQat9WGeZa2FzDRba3ikBTxMevju8T9I2s19yFeztg72WQTcyDhN0I/TryQNcqZq67e8ScokSwQ1pE95EkIBdxk+7J9IIm1KHGp7P1T6PmxBqSyCyJT53AJgQxbhG2N+2NCpIk0ZfKA9Apvg/UfBFli/pa42N1XCdVnLwWW9wOY+vSbuo9Fnf91wTW1SrH1cZCrcWDFzJTlB703WUdA97ZyWuRMwypjXj5RGpTRi1R/maM3DwIcC6ktl+aczr8jK94UVPZ2iNVmgk/Ml92vly8vycYSTkHvFCHmw0gzSyhBjaCDSEL80nw4T4XjrrNfohWQRYDnk+isTfbfmpt6KRz8yIczndwTZdSN5rYigqeAJMd9DAxm28DcGCUk1nOyeASMtByfmPDd/jp6ihDR8Uj10eaty7X0LyjvB3Ol4kjvNucSPwJhwe6PCULDCMKKM9EQFTs0UiiyAhA/1N52njX2EpWDLOnT8yfMMDfDOwdwex/3DVo22nYjzTArBjbjJ4N6RtPW0rrWXJNJFHpm6ZSUTFZXgtZw+wAvBxRWiuXsvQqUYS4a25rN1/8aIaKxV9rxhSTZzF7l9K5S0wvjF1+kwarDs/M5SQT8pZtdEnySC5tgn057VgiCpEHbCWYm16zWPv7ARLsRV8D21nmMoYAJqJ5jZZMcrVTMuutYG7zc7W2rmjt2Nto/enbDGWgBeyMCsCPPA6+VYvOXWV6JTCwwCUQ//+LH4z1Kokk02ObYuNfwh0x4ilnU6JYM9t65ExOl7shHpKQUHrXwtwDi49hZNTD78s3yPOJYa5E9delhUSFFCAqH5/AxgSFKMOJXyBgsQlntLLWlYGCUabX61ClQuf3flI'
                    'Q80RBZKlwA6qTpW3dS4EcgCP4beaujMVq/ifreAkY3hGwZwbdXViux7rLJTdj188Bim8KVbCYfIwWWoin8Nsi/rZiPorqikSMdyEw9VoWtIMz6/PNeJY5mh68hzeCGFKEIRNDPy+wMlMbh1Q1vzj1RTQa7sMAaDrq99gx3oc+CXHZKpbVwPOk/HwjJ6JM90TNrZdBIL0+PW98LgriR5FuqoUFp4DUHMSW0YjZDqj+MUq9OMFhOCFUTzg53NkBlgvKdzzr8Afve7xL9pXCcvXdRPxCHW78Hj1cJn/zmOe19RissiNTqUS5ArxaCeiD3IEmVKJboz2B2E7kp+mwpjCvx0IJ9HUUGJiBeP2ayo9SGOxZPfKVZ3hLV5Yrk2kyOagPI9ZA7kNzCRQO0+cgObPKve9kqANbcB7CxIWP7yVTTMGN2hHwzK731hA4nU7VXT2af6fO1/A42/DHaqmLqgBNBij6ihMW+xtOUmfJ9Fft/+9fTMps9rvznPluGxp4LwmLiugk9OEg+5qzJMzpec/zYFU0L3GWPiMJpcrBgO4uZ9Sl+beLk11GzbrFgcL+3Uhb7dzgxZvAaE4kHPbx2W4VDJGCuXdiTTlPZFwV2KTE2k7U37bP7IvgRDSu18ZjXqS0ckwDqd/jbXwmc84FLEo73rs9D050kmeYREx9c/GJHs6bR5bTIKkrCorEXJ+I1LNItiyYpgQ0fCsutxe91UwVLh4IV1l+jmjQoOeY99vzYmqJ/mv1FbWuqTSFZzHOIJmxpY3hSHGsnjh3fTlCwp2vb7OI2OcS4hdPfm/wUwiMoO4o0+MEEIZq4s2/243WkXxnQv4x8eGJkbBvhlhKgNOoNxwB5wgAAnhXkH3PH08VS1skVudmUwMNChQMwKnQr44CUMhYsmy3PXftAeLMBvTjSAngfdupJU6mV6hQHcioY+uk3cq0AfBtDdRKa/ANMFXNFt45zbANxG2wtfbaGLKmSETIPxshs5KupcFM+E0ikl+/iO8sLV7tbIqPmgzKG4kuovGfVw/Io8Z+ol113M9419oCHr8M9LZcqOw1HbQcCC2hDQCyW9aiCEryPZyUN0c84vCukRQACb0YeTBu8Hl693+QJd0KVAJ8c05wTRa0xBjdsTdZ2jVGdSez42wtoI/ZaMsjcOFKrjaeMuzH5ZWNJROiawSaucbQfRtrfvIXBDaOacqMEIFp3qU9wlzUYAAJhhHp0I2DeM4moOILIdIS0hflR4p2MLF8VR9TO6sy3qaQ+omHxh4mWqVin/PqYKElWtTbxMOCM5U5sxJHVw+MsnD9lcqpWRyunuYDGtMdDLOXHUxRsoqk7O0X3gB2Pta+ffxXL5yNMsQAMBqzvlO/x6N6gWQxkySjqMwrj+oeKs/uVWuSbxvnsGAkR1k4XobilSn8pN4Tws3cnNH848CYCoLrOEIXGQOFfm5IqLBami3ECbfrxZOnlctJ2O2FMMtM4oKK889EbGznvm76A2lOEmgIMhPDFsNwca6AJRIP+AbZVafFTK/pjG/DQR+Onj5x1ArfG7xkX1GcgsKqlPk1XC+SyBa1Q0/BE+lvrYD7/ozLSA9t87Gsm0/+fFpWr7+Dx7dKA1qQfhE5TU+uhAn5iz6m/4mcH1JTKhW2EZVdLI34Fg8MVPBHDoGwcnYGw54D9UT1dHjUdYKXDmkECVg9t/fGLNAryddSE8gwBmGQPQBCg8ACFDG1Vz7pz4DwtIHtc+vs8Q0tjuCRut2S7fexj9jEXaUHUaUiY9yMHL6g/3X9/7WsxsQ9BVauhusCPC4WjsKkFny/W7felQcWbX9OJ/73kRA78BuG1yWPN1xEkZFe9IWQhMCCOKZ+xXJs7IBi4bsctunx/TyWznFXi5mUtVyLgEG1JAG/7MvLXxyJrg2RhViCrMv/zWdjxuaGL1oPA2JINl9QnSsWFMYJwsUFy93HIP2KIILJzam7R0Q23+Xj0ioiO9tFl5PGAlLJEMhRVREnayraf5PKcmA'
                    'YJsJNguyoTJhfyFCsC0kA74a9S7YwXiBnr7SLHNuVvBACVyvcSqGsVb/hXDDwzdW+UTXiklYnH5U7POZNSkXq539j+FG71Ndxsxz906PmTb/ZU6d3X1Zlm583SRB8VzYf5qCXrHJCK7d98zytr9XKoUH1rIItoqalLp67udBMEOqRrdiG5GYV/P117dunqKt8cVryDjUuiFfkNNRSSBknnFEVuIXdeFOo/tgfX6AqU5sDmjajo88fRSOnnDkAK0YazroIwporIjp8QxTCv+HLLpt1FsQWnxI7gc1hNaUnzCkTuoTTwLzIAKzJ5iWfgJvu2voRLFZ7crFe8gJ5eCZ3x3O6uvxvkhit0XYFsuPL2A7b4agWGb+fXNbdccCoKVo1mZjI5EX6medskd6mcEEKscxBWb8skl4azvlcA8v4l58nkVF3P6puR3nR+nMlT+igLAEttSfIO4aKH2ry5R4D14InwrKbURhOZOiLmilVjqtTZJ1gI/pLg9F7d4FLpG0qINV+srl1aC56zfI4MkXjroArUE85yO/XmgrqWHS+PIFlUZyAEk7tb0HcK5J/vAt0MGEsya6QGk2+6nBi2QDEdcykbe9GKcJSv4JKlzjngqjh5yjz1PY2Ui4QsuAQYfVpLOJXFtsVyXxl9OnWNAkIcyRdjR+UyJUqeMrJmvCGZvyDkr0heFp+W0XN1aW6fOlB4wURO6wvvmT+f2cOR8e8oRW3UdURs+UPBVQSkaU8fHDFecHkfvruVuN0JhKFDGijcGQEicA2sSHfgSrzv38aOwCUmxsPaSIdLqYlz+Q+GzPkMFQpkQt43C1yLaEh8FSOkixOV/P2Y3q5PsII3yfgdHf6aTGAy3OPK7eWc4Yo/avmsj21hPcJDoJk3iMYGQE/kGwueljbGLkESjROGcbJOe7qwavRbM5Ok+TgKmR1kEeKJ7rU3UWh9Ttz+oBd+SZUXzbphYUvPLH1GLR0J8qW41Yv6WmL7Zg5XMYw6OmCWInmkSCQPoTUEhrkagnscZ7OFpdls0QE7tFTHKmzXU66cAD86BZofRkBTdYI0bk61VLr6hXV81YSBQTBVZu8FkAYYfI40l7FHDi/3fNQQ6vGGlSCz5ULlF4QEeBA5rzPBkzpcK22e+bl6YBOnnpx3N7edak3Auc96oGVFabec8QM3CUI4G98rt3A/OGQw9iu6P8WFfbuBQnCtva4pFCrJorA/6QXda247/pRL7ov5lMMc1qqLrYzxLgTUoYs0CCgIosEhucryWseQ9c5KzY+r0pChkUkKhkmXUxMqO6+5pFZ/ef1Oy4KXQYUMR+RU/obNSHyyB+L70Sw/xJCeGy9d9bCjMmkDL0t9elhkn0unvzObirMrHPh4h4FXYx6rxyfqdcz8w7KsElalaFk4PIQKupZTp+UayvTCKNPwLuaEXQr5tXccra5niBnN+TAWRzWKXefACVlF1xiVE3mhbH/M6gdTYp/Pj6fxWoP7pQG5lolcJsn84BG8yt2DYJUknDNBw992dolm7mpFWDbFySsKcyZfXTl9qxNUTG8ge19reYz+pNZANlWEQf2tG+StIiFZVZkj/X9DQECCuvK1aCPfb7jop14pPtOC9iNIjBG2/MvwoiqsDLz0IZMMA+Yz//STFJBO/mDzll0Js+znxQTl2VOuTxOpZ4SQvPnp2jPxVW/+EaA1PCQhOvy0x2kkH1K+KPsIJkQvLG7XbS0C+qOqvmccjBRN0iwf0DD+tqjYVUZ/EkLd7vtQEKL00HMKkdErClQxRPD/1bTe1aw3OUfegjlohma7sjZCQPrD/7Z81oVOZfLBxTM9kYwx5DdvZP8K2g/v3qjtEac4oT71W/a3yLRGllWEuKf6d08Yq2LrR5jcNy22U0B9R0exyFKegatzOOCoyxzQ4/GRGNuRXdvdnzwZqUCxY4war/yVplduX9R8pq+wZZLvFF9T1AN13JSKbB82LG/D7dMgZpw+Av8ur98jpUn8RoTPWaLAyE'
                    'VFaYPSy5QT6vDHtXFXD8PVi2ET3uWpKCrVPRiy6sYGHB75XzN2MvXsqvRr7voBo4Rl4TXbZaznSxwxYLzHmIM8XzLekBxOGg+p6ROERQ0Bw0MYscv5TDPunfts+tIU2ykVfyfkt+4wyzX32uOseAi7rn40pXw2fixSAc8lBe3h7myKkGvkn2EkxmKsvs+6ML3TeoTherBgPi+8V3cCgIakdNXCyq7Dm9HeZ4yJEmgWaAHkLZq6C4ZmrJ2ZVXVFc8zxGao/IHFQCrsNMXa4WcnDdLKl/88v9A+W4nLQmDIcvU+rfQKGhBp2XbnEWrzewVw9d8ysuyeqiJyjvjjBIbLK+AZapva7xG74cN9FNuGWOdt7+pxiZes94+9ERUbT/Sxhdca+sGV1E9ueSv8Bw4FZ0l7qFOs2AyO65DUTekPwM3H84MMyRDXrVi733KMjduEnhjQtfoEYidQBuvpOUm5opb7xiVGgtDqtYU/P2D4Ztf1x+n/r7aZqytfI+8CJwKh9qhhgT5NKH4Bp/AuJVJqHsZIdUUNxrUhCprv8RU74Q1y3DimHkHr+yqr3LU7flZ1MnZQF+VZ51PgQTfhrGgsLCs73jPMgv9jLsRpNxs5K7EIThZUiiDMgdP4jicfrsI7e0XT9D9Nmpvwj2flU2pBkGNO9v+1YYpK2hb71KAxj9kE8KrKshiJHv9WU1RqRmWmIfvIvi+BjfaIMeywTCFcMKWFPret7zY8ZhJqvaowFoCyhNiLYWFjvKqeTZbeJti/O7AKjavWn8fa5LoHqGiIQeGjp5izIEbD79R7CaNNmE0suHKFFSjOqU1yrQQ8saoMkT3wHspM4A42gOD/HGFu0fNm/RGNZpBAqHmwOJ/6AhkKU2scSX0QXZrRXhLjABqvBo0z/OcCJSgTx3aUECmBfggWoSJcFUnfREqLlaecxEfme1ZKGkpNvJBwnNCscy9FQfQsgw0ryS1AzcUyX/VNgW/3ny4edpDK8dcVVmXJhft4c1yH+QLA1be0clmpLf64M8t4pkD0LGSXSNL7UqVHfkSiyaSWttwjdGmELYSQohx4nWEsPUO+tze1TYeBlsgVdH6UctVzFTuop5jLUVR3oHOBScFashAOHcDalzVcbzpJ0vn2n7YeCN51+5gPhgzWqd4bP1xNo0Gr0VTCVWpAqoTlRj6HekLhriSxZ6peiDarmlLp40AYGgViIf2Zh7uWa71YEDHlx+MT/VIQxPrSnnbCHGT+a2hzy54AoynEdL73gkCj8JgKce73cjtoPaFfRG+FLk/R+07JI+Al8cV07RwQysZSFZS1nkAvntcMgtBJLudhlc3IMP9k3l6EOxZqHJyZGoOwTfCMjLg2P7Z7SScEpykE+lzWFU0W6O+4OGpE8K/zYbErAdtF7JCLJLVA9AvKCyeDmQGdRGbhcPhnskFIGpL9RNHKZHAh8eTKQdE/Dk/K3+0wPGXD45iCk6lbgu9S5x0uE/kcVWDb3TfrvvoqycGwdnxALI7/lFVlb0sxrDrnNOuEG0canG+RKOKIPJZYa5FyVu1tXpr+2kYvvcsIwxVVzTl7/jOo4Fnmb8b/7QxXfZ3UVVLj+8N+P6M0qUCsiaE3pGnGy1HxEwfC7nIJfki3+tBIBa0hDnw1cxfQi32uvdlqyeV0/VX1O2tg1Dj3ihbMrG2KQ+YTKNjinDUA3QmF3K1ipKk2+xoilF8vuCQaEJVJIaDOOIwX4x+/4d0n1Q68MaLgw5mxK3dv1A8hr60kZ8fYPVvNkMCBeLo7cMVs4swMuVSjM6CxrQsBId/+JktBo7RHJakj7aeWdZ+g8ITxx54oNQutt9+h2QTKGpSDyU0j6kF61rn5M+H3MB1ZN8dLE22fcXjzHFGAKvJzJM/7w5LDQ/Oh/Q0Z66oDeacr+NtAjsok7FIn91NLerbGoy4rjKNc83qyoKwdDmhWrokeneCS5kqgTG2b17cGb1ynyBNKBFTqDtbnkFTK68vsJtP0hzN7hXf'
                    'INKOTGUEPdKTE5WPyt2ZrYOoz0VnA01EewJa7gXUec6x/kBp7X9240r01ywuRGrw5l+JXtiUmYjOteA7iSqWwbwqnFWwAlBgfXIvw9hcrVN3eXxX4K3fIucHS2ibM3KE/e1VviGkCU8/K2OMzQHOuiQiix9UPiwF4oUGRalHxBchirfetT5yIhGsdLah1X3CyDEaxUA6Cos6rV4gB6ouVnVVw8pqJVY5JX+rYc161tRLFVmtrQZhbstM9Gbc5dJpHJl6xql//rGdgAcEnv5jm2xe2BHY4Wn5y4P5PGeuNe1fKBcnLlgpjK4dHMB0NUfLH4o2E767ZXB8rfndv19ZMfhMIU2E2x+A0MZNhKhy2mefFaj+wQ0OVddKhEoXYMETtGaP0pn2jfwd/r8jBwn62zgNRmZFfhJ4OqbYydTuuhuZQyfZpLlF3uWxE8tqNWWzLRRWTZVwBAzKexPEuzsVIiKkrXX94kYze8kt8KcoDkN19jrSVyomHZMBk94OnNouLVONkXWkDxvDIbVMvXSJs+uqk7228DmhZplBwNSpaVg19q9Ny0vCvio98Dh78Pqi12XYaKRohe7RuJbzUrwunTW4hsV5xAreCy9n2DtRKWWI1v7rw4L/750nS1LtOJXUDbG0FLCpRyHmVhckad+YXGK/V6QtTtVDOp+DqH/7mlgeTkOjzuXej5i03PaZhez4eXw6Cozt0BqmbbaOK7aBv1GdZTWWVlQ7A7fnGGCxFoElmuksWIKIzhwqf89a0Lnk0TjF37f55mvnr5F3XVW4TlVUhsKhsHIANNqb/xKBFdxWSjMJg12V/5DeItXrcpr3pI8KJayTCpBOqbzcfhk+fMjMmDY6/+f1E+nMpqRYfzecMDYwHwpPV0F9DT5xzddj/vFPQMWgR46dz/jVaakX804jDbJ3xCVBGa4EpCLR3Br8Tmi7lA8RKoRgxEayH4PYHpI++Zi+VdU9X5R0ANvWmFqtzzv2XuCg4dPwIwFAfmeisnvis81lF4xei5s7bTlubyuMo13VKRbMAYj92exfPxrwl5N+9qbnmIzidl7/mmGq5pNHJ6zUOXizulKFbnpJw2S65Aun2jmaWdQinTF7Nv+Jxcd+4GSkkUPcQNhIwoE7rIF2PaLBSPFwEYkro/FnxsWElzk8z1ReQikPzMGh4+GnW2dzU0qF+G4X0CNiVewq1of+B6jQotyvLXtmsinINsLZ+EtE1J7ld4El1EMvTPD4hyVHmU5TMlKq320KlRFE9h33vszSAjEmhnM695IoF9R8jlHQ7uDJ7n05l1da3nugwlRewsC5sQtuOQ2+DQq2MKwGKDe/FckChLyWE04XHP+pDmSnNzjzjScWJswnucFfv+ThapwkyJHzGIU6kFd1RXXSnusEkker69Er4NvK4MIYUIqUBXBBIKdOCD/90q8FB/22tu7JITuKl6c3vPlcSI5zUNdClEl99ccvLc2nY9ggGVe028='
                ),
                '__VIEWSTATEGENERATOR': '16D6DBB1',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': page,
                '__VIEWSTATEENCRYPTED': '',
            }
            url = 'http://www.gxzbtb.cn/gxzbw/jyxx/{}/MoreInfo.aspx'.format(categoryId)
            response = requests.post(url=url, headers=self.headers,
                                     params=params, data=data,
                                     cookies=self.cookies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            url_li = selector.xpath(
                '//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href')
            for href in url_li:
                urls = 'http://www.gxzbtb.cn' + href
                # Only queue URLs never seen before (Redis set = dedup).
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        """Worker loop: drain the Redis queue in small gevent batches, dropping
        to batch size 1 when the queue is nearly empty."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, then walk every category/page producing
        detail URLs; restart a consumer if anything is left queued."""
        threading.Thread(target=self.init).start()
        flag = 1
        # The categoryId is always the first 6 digits of the type code plus
        # the full code, so build the task list instead of repeating it.
        types_li = [
            '001010001', '001010002', '001010004',
            '001001001', '001001002', '001001004', '001001005',
            '001004001', '001004002', '001004004', '001004005',
            '001007001', '001011001', '001011002', '001012001',
        ]
        task_li = [{'categoryId': '{}/{}'.format(t[:6], t),
                    'types': t,
                    'all_page': flag} for t in types_li]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i)
                              for i in range(count)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Crawler for the Shenzhen government procurement site
    (http://www.zfcg.sz.gov.cn/), searched through the WAS5 backend at
    61.144.227.212.

    ``load_get()`` POSTs one search-result page and queues every detail URL
    through a Redis dedup set / work list; gevent workers started by
    ``init()`` consume the list with ``load_get_html()`` and store parsed
    announcements in MongoDB.
    """

    def __init__(self):
        name = 'shenzhen_zfcg_sz_gov_cn'
        self.coll = StorageSetting(name)  # MongoDB storage helper
        self.collection = self.coll.find_collection
        # NOTE(review): the Referer below contains shell-escape artifacts
        # ('^&', '×cope') from the capture it was copied from — kept
        # verbatim since the server does not appear to validate it.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            'Referer': 'http://61.144.227.212/was5/web/search?page=4096^&channelid=261279^&orderby=-DOCRELTIME^&perpage=10^&outlinepage=5^&searchscope=^×cope=^×copecolumn=^&orderby=-DOCRELTIME^&chnlid=^&andsen=^&total=^&orsen=^&exclude=',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Origin': 'http://61.144.227.212',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.session = requests.session()
        # Redis-backed dedup set (dbset) and work queue (dblist).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='shenzhen_list1',
                             dbset='shenzhen_set1')

    def is_running(self):
        """Return False once the work list is drained while the dedup set is
        non-empty; True otherwise."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo _id)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement; the trailing is_running() read is
        kept from the original flow (its return value is intentionally unused)."""
        self.coll.saves(result_dic)
        self.is_running()

    def load_get_html(self, url):
        """Fetch one announcement detail page, parse title/date/body and store
        the record in MongoDB."""
        try:
            # NOTE(review): pages are assumed gb2312-encoded; gb18030 would be
            # a superset if decode errors show up — confirm before changing.
            response = requests.get(url=url,
                                    headers=self.headers).content.decode('gb2312')
            selector = etree.HTML(response)
        except Exception as e:
            # FIX: the original message had no '{}' placeholder, so the
            # exception text was silently dropped (and "laod" was a typo).
            print('load_get_html error:{}'.format(e))
            return
        title = selector.xpath('//*[@id="content"]/div/div[2]/div/h4/text()')
        if title:
            title = title[0]
            try:
                # Two CJK chars followed by 公告 at end, e.g. 招标公告.
                status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
            except AttributeError:  # no match -> search() returned None
                status = '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = selector.xpath(
            '//*[@id="content"]/div/div[2]/div/h6/label//text()')
        if publish_date:
            # FIX: guard against a non-matching search instead of calling
            # .group() on None (AttributeError).
            date_match = re.search(r'(\d+\-\d+\-\d+)', ''.join(publish_date))
            publish_date = date_match.group() if date_match else None
        else:
            publish_date = None
        soup = BeautifulSoup(response)
        content_html = soup.find(class_='main')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['publish_date'] = publish_date
        result_dict['source'] = 'http://www.zfcg.sz.gov.cn/'
        result_dict['area_name'] = '深圳'
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '深圳市政府采购监管网 '
        result_dict['en_name'] = 'Shenzhen Government Procurement'
        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result_dict)

    def load_get(self, page):
        """POST one WAS5 search-result page and queue every detail URL found."""
        try:
            params = (
                ('page', str(page)),
                ('channelid', '261279'),
                ('orderby', ['-DOCRELTIME', '-DOCRELTIME']),
                ('perpage', '10'),
                ('outlinepage', '5'),
                ('searchscope', ''),
                ('timescope', ''),
                ('timescopecolumn', ''),
                ('chnlid', ''),
                ('andsen', ''),
                ('total', ''),
                ('orsen', ''),
                ('exclude', ''),
            )
            data = [
                ('showother', 'false'),
                ('showtype', 'txt'),
                ('classnum', '20'),
                ('classcol', 'CTYPE'),
                ('channelid', '261279'),
                ('orderby', '-DOCRELTIME'),
            ]
            url = 'http://61.144.227.212/was5/web/search'
            response = self.session.post(url=url, headers=self.headers,
                                         params=params,
                                         data=data).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath('//div[@class="r_list"]/dl/dd/a/@href')
            print('第{}页'.format(page))
        except Exception:
            print('load_post error')
        else:
            for href in url_li:
                # Only queue URLs never seen before (Redis set = dedup).
                if not self.rq.in_rset(href):
                    self.rq.add_to_rset(href)
                    self.rq.pull_to_rlist(href)

    def init(self):
        """Worker loop: drain the Redis queue in small gevent batches, dropping
        to batch size 1 when the queue is nearly empty."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, then walk the search pages producing detail
        URLs; restart a consumer if anything is left queued."""
        threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 43879},  # full backfill (disabled)
            {'all_page': 5},
        ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [gevent.spawn(self.load_get, page + i)
                              for i in range(count)]
                    gevent.joinall(spawns)
                except Exception:
                    pass  # best-effort: a failed page batch is skipped
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Crawler for the Shaanxi public-resource trading network
    (http://www.sxggzyjy.cn/).

    ``load_get()`` fetches one listing page and queues every detail URL via a
    Redis dedup set / work list; gevent workers started by ``init()`` consume
    the list with ``load_get_html()`` and store parsed announcements in
    MongoDB.
    """

    def __init__(self):
        name = 'shaanxi_sxggzyjy_cn'
        self.coll = StorageSetting(name)  # MongoDB storage helper
        self.collection = self.coll.find_collection
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxggzyjy.cn/jydt/001001/subPage_jyxx.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Redis-backed dedup set (dbset) and work queue (dblist).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='shaanxip_list1',
                             dbset='shaanxip_set1')

    def is_running(self):
        """Return False once the work list is drained while the dedup set is
        non-empty; True otherwise."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo _id)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement; the trailing is_running() read is
        kept from the original flow (its return value is intentionally unused)."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort extraction of a '省-市' area string from *strs* using the
        ``transform`` helper; falls back to *pro* when no area is found and
        implicitly returns None when ``transform`` itself fails.
        NOTE(review): r'/r|/n' looks like a typo for r'\r|\n', but \s already
        covers those characters, so behavior is unaffected."""
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '',
                                     re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            pass  # transform() failed entirely: fall through, return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url):
        """Fetch one announcement detail page, parse title/date/body and store
        the record in MongoDB.  Silently returns on fetch failure or when the
        expected content element is missing."""
        if url is None:
            return
        try:
            response = requests.get(url=url,
                                    headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            print(url)
            return
        title = selector.xpath('//h3[@class="article-title"]/text()')
        if title:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # FIX: the original pattern used a character class
            # ["招标","中标",...] which matched arbitrary single characters
            # (including quotes and commas); an alternation matches the
            # intended announcement-type words.
            matched = re.search(r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
            status = matched.group() if matched else '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = selector.xpath('//div[@class="info-source"]//text()')
        if publish_date:
            # FIX: guard against a non-matching search instead of calling
            # .group() on None (AttributeError).
            date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                   ''.join(publish_date))
            publish_date = date_match.group() if date_match else None
        else:
            publish_date = None
        area_name = self.get_area('陕西', title)
        source = 'http://www.sxggzyjy.cn/'
        table_ele = selector.xpath('//div[@class="ewb-main"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '陕西省公共资源交易中心'
        result_dict['en_name'] = 'Shaanxi Province Public resource'
        self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page, _retries=2):
        """Fetch one listing page and queue every detail URL it contains.

        FIX: the original error path called ``self.load_get(...)`` with the
        same arguments unconditionally, recursing without bound on a
        persistent failure; *_retries* (new, defaulted — callers unchanged)
        bounds the retry depth instead.
        """
        try:
            url = 'http://www.sxggzyjy.cn/jydt/001001/{}.html'.format(page)
            response = requests.get(url=url,
                                    headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if _retries > 0:
                self.load_get(categoryId, types, page, _retries - 1)
        else:
            print('第{}页'.format(page))
            url_li = selector.xpath('//ul[@class="ewb-list"]/li/a/@href')
            for href in url_li:
                urls = 'http://www.sxggzyjy.cn' + href
                # Only queue URLs never seen before (Redis set = dedup).
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        """Worker loop: drain the Redis queue in small gevent batches, dropping
        to batch size 1 when the queue is nearly empty."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, then walk the listing pages producing
        detail URLs; restart a consumer if anything is left queued."""
        threading.Thread(target=self.init).start()
        task_li = [
            # {'categoryId': '', 'types': '', 'all_page': 1845},  # full backfill
            {'categoryId': '', 'types': '', 'all_page': 2},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i)
                              for i in range(count)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Crawler for the Hunan public-resource trading network
    (http://www.hncg.gov.cn/ details, listed via http://www.hnsggzy.com/).

    NOTE(review): this crawler is visibly unfinished — ``load_get()`` fetches
    a listing page but only prints the raw response and iterates it
    character-by-character (the JSON/HTML parsing and the call into
    ``load_get_html()`` are still commented out), and ``run()`` never starts
    the consumer thread.  The original (incomplete) behavior is preserved.
    """

    def __init__(self):
        name = 'hunan_hncg_gov_cn'
        self.coll = StorageSetting(name)  # MongoDB storage helper
        self.collection = self.coll.find_collection
        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Redis-backed dedup set (dbset) and work queue (dblist).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='hunan_hncg_gov_cn_list1',
                             dbset='hunan_hncg_gov_cn_set1')

    def is_running(self):
        """Return False once the work list is drained while the dedup set is
        non-empty; True otherwise."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo _id)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement; the trailing is_running() read is
        kept from the original flow (its return value is intentionally unused)."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort extraction of a '省-市' area string from *strs* using the
        ``transform`` helper; falls back to *pro* when no area is found and
        implicitly returns None when ``transform`` itself fails."""
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '',
                                     re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            pass  # transform() failed entirely: fall through, return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, data_dic):
        """Fetch one notice detail page identified by *data_dic* and store it.

        *data_dic* is expected to carry the keys NOTICE_ID, NOTICE_TITLE,
        PRCM_MODE_NAME and NEWWORK_DATE — presumably one record of the listing
        API's response; TODO confirm against the (unfinished) load_get().
        """
        if data_dic is None:  # FIX: `== None` -> identity check
            return
        try:
            url = ('http://www.hncg.gov.cn/portal/protalAction!'
                   'viewNoticeContent.action?noticeId={}').format(
                       data_dic['NOTICE_ID'])
            response = requests.get(url=url,
                                    headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        # Metadata comes from the listing record, not from the page itself.
        title = data_dic['NOTICE_TITLE']
        status = data_dic['PRCM_MODE_NAME']
        _id = self.hash_to_md5(url)
        publish_date = data_dic['NEWWORK_DATE']
        print(publish_date, title)
        area_name = '湖南'
        source = 'http://www.hncg.gov.cn/'
        # The whole document is stored as the content body.
        table_ele = selector.xpath('//html')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '湖南省公共资源交易中心'
        result_dict['en_name'] = 'Hunan Province Public resource'
        print(result_dict)
        self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page from hnsggzy.com.

        NOTE(review): unfinished — the response is printed raw and then
        iterated character-by-character below; the parsing step and the
        load_get_html() hand-off were never wired up.  Behavior preserved
        as-is.
        """
        try:
            params = {
                'title': '',
                'origin': '',
                'inDates': '1',
                'channelId': '845',
                'ext': '',
                'beginTime': '',
                'endTime': '',
            }
            url = 'http://www.hnsggzy.com/queryContent_{}-jygk.jspx'.format(page)
            response = requests.get(url=url, headers=self.headers,
                                    params=params).text
            print(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            response_li = response
            for data_dic in response_li:
                # Iterates single characters of the response string — see the
                # NOTE above; kept until the listing format is confirmed.
                print(data_dic)

    def init(self):
        """Worker loop: drain the Redis queue in gevent batches, dropping to
        batch size 1 when the queue is nearly empty."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Walk the listing pages.  The consumer thread is intentionally not
        started here (left commented in the original, crawler unfinished)."""
        # threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': '', 'all_page': 2},
            # {'categoryId': '', 'types': '', 'all_page': 1000},
        ]
        count = 2
        for task in task_li:
            for page in range(0, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i)
                              for i in range(count)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Shanxi provincial procurement site (山西采购电子商城, www.sxzfcg.cn).

    ``load_get`` pages through each listing category (``nav`` 61-69) and pushes
    unseen detail URLs into a Redis queue; ``load_get_html`` workers (spawned by
    ``init``) pop URLs, parse each announcement and persist it to MongoDB.
    """

    def __init__(self):
        name = 'shanxi_sxzfcg_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxzfcg.cn/view.php?nav=61',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Redis-backed URL queue: dblist = pending queue, dbset = seen-set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanxi_sxzfcg_cn_list1',
                             dbset='shanxi_sxzfcg_cn_set1')

    def is_running(self):
        """Return False once the pending queue is drained but URLs were seen; else True."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort '省-市' extraction from *strs*, falling back to *pro*.

        Returns None when the address parser raises (original behaviour).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # parser failure: caller tolerates a None result
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url):
        """Fetch one announcement page, parse it and store the record in MongoDB."""
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@valign="middle"]/h2/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                # The original pattern used a (buggy) character class
                # ["招标","中标",...]{1,2}; use explicit alternation instead.
                match = re.search(
                    r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//td[@bgcolor="#E6E6E6"]//text()')
            if publish_date:
                # e.g. '2018年7月5' -> '2018-7-5'; guard against a missing date
                # (the original crashed with AttributeError on no match).
                m = re.search(r'(\d{8}|\d{4}年\d+月\d{1,2})',
                              ''.join(publish_date))
                publish_date = re.sub(r'年|月', '-', m.group()) if m else None
            else:
                publish_date = None
            area_name = '山西'
            source = 'http://www.sxzfcg.cn/'
            table_ele = selector.xpath('//td[@class="c_pt"]/table/tr[2]')
            if table_ele:
                table_ele = table_ele[0]
            else:
                return  # no recognisable content table: skip this page
            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '山西省省级政府采购中心',
                'en_name': 'Shanxi Government Procurement Center',
            }
            print(publish_date)
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page (``nav``=*types*) and enqueue unseen detail URLs."""
        try:
            params = (
                ('nav', types),
                ('page', page),
            )
            url = 'http://www.sxzfcg.cn/view.php'
            response = requests.get(url=url, headers=self.headers,
                                    params=params).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            url_li = selector.xpath('//tr[@class="odd"]/td/a/@href')
            for url in url_li:
                urls = 'http://www.sxzfcg.cn/{}'.format(url)
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        """Worker loop: drain the Redis queue with small batches of greenlets."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # queue nearly empty: shrink the batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Enumerate all listing categories (nav 61-69) and consume the queue."""
        threading.Thread(target=self.init).start()
        flag = 1
        task_li = [
            {'categoryId': '', 'types': '61', 'all_page': flag},
            {'categoryId': '', 'types': '62', 'all_page': flag},
            {'categoryId': '', 'types': '63', 'all_page': flag},
            {'categoryId': '', 'types': '64', 'all_page': flag},
            {'categoryId': '', 'types': '65', 'all_page': flag},
            {'categoryId': '', 'types': '66', 'all_page': flag},
            {'categoryId': '', 'types': '67', 'all_page': flag},
            {'categoryId': '', 'types': '68', 'all_page': flag},
            {'categoryId': '', 'types': '69', 'all_page': flag},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Leftovers still queued: start another consumer pass.
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Yinchuan public-resource trading site (银川公共资源交易信息网, www.ycsggzy.cn).

    Listing pages are fetched via an AJAX POST endpoint; each item is identified
    by a ``key`` token which ``load_get_html`` resolves through a second AJAX
    POST, parses, and stores in MongoDB.
    """

    def __init__(self):
        name = 'yinchuan_ycsggzy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Origin': 'http://www.ycsggzy.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer': 'http://www.ycsggzy.cn/morelink.html?type=12^&index=0',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        # Redis-backed queue: dblist = pending keys, dbset = seen-set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='yinchuan_ycsggzy_cn_list1',
                             dbset='yinchuan_ycsggzy_cn_set1')

    def is_running(self):
        """Return False once the pending queue is drained but keys were seen; else True."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort '省-市' extraction from *strs*, falling back to *pro*.

        Returns None when the address parser raises (original behaviour).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # parser failure: caller tolerates a None result
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, key):
        """Resolve one article *key* via the AJAX endpoint, parse and store it."""
        if key is None:
            return
        try:
            url = 'http://www.ycsggzy.cn/Ajax/article.ashx'
            data = {
                'czlx': 'article',
                'cxcs': '12|0|{}'.format(key),
            }
            response = requests.post(url=url, headers=self.headers,
                                     data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//p[@class="a_title"]/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                # The original pattern used a (buggy) character class
                # ["招标","中标",...]{1,2}; use explicit alternation instead.
                match = re.search(
                    r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            # All keys share the same endpoint URL, so the key is part of the id.
            _id = self.hash_to_md5(url + key)
            publish_date = selector.xpath('//p[@class="box_p"]//text()')
            if publish_date:
                # e.g. '2018/7/5' -> '2018-7-5'; guard against a missing date
                # (the original crashed with AttributeError on no match).
                m = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',
                              ''.join(publish_date))
                publish_date = re.sub(r'\/', '-', m.group()) if m else None
            else:
                publish_date = None
            area_name = '宁夏-银川'
            source = 'http://www.ycsggzy.cn/'
            table_ele = selector.xpath('//ul')
            if table_ele:
                table_ele = table_ele[0]
            else:
                return  # no recognisable content: skip this item
            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '银川公共资源交易中心',
                'en_name': 'Yinchuan City Public resource',
            }
            print(retult_dict)
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and process every article key it references."""
        try:
            data = [
                ('czlx', 'linetxt'),
                ('cxcs', '{}|{}|{}|20'.format(categoryId, types, page)),
            ]
            url = 'http://www.ycsggzy.cn/Ajax/morelink.ashx'
            response = requests.post(url=url, headers=self.headers,
                                     data=data).content.decode('utf-8')
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            url_li = re.findall(r'key=(.*?)\"\>', response)
            for key in url_li:
                # Details are fetched inline rather than queued to Redis.
                self.load_get_html(key)

    def init(self):
        """Worker loop: drain the Redis queue with small batches of greenlets."""
        count = 3
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # queue nearly empty: shrink the batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Enumerate the configured category/type pairs and start consumers."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '12', 'types': '0', 'all_page': 2},
            {'categoryId': '12', 'types': '2', 'all_page': 1},
            {'categoryId': '17', 'types': '0', 'all_page': 2},
            {'categoryId': '17', 'types': '1', 'all_page': 2},
            {'categoryId': '17', 'types': '2', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Leftovers still queued: start another consumer pass.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Suzhou public-resource trading site (苏州公共资源交易信息网, szzyjy.fwzx.suzhou.gov.cn).

    ``load_get`` pages through each listing category and pushes unseen detail
    URLs into a Redis queue; ``load_get_html`` workers (spawned by ``init``)
    pop URLs, parse each announcement and persist it to MongoDB.
    """

    def __init__(self):
        name = 'suzhou_szzyjy_fwzx_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': '06AB3D9C05E9FDAB1EDDAD36BA60296F',
            'Referer': 'http://ggzy.hefei.gov.cn/jyxx/002001/002001001/3.html',
        }
        # Redis-backed URL queue: dblist = pending queue, dbset = seen-set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='suzhou_szzyjy_fwzx_suzhou_gov_cn_list1',
                             dbset='suzhou_szzyjy_fwzx_suzhou_gov_cn_set1')

    def is_running(self):
        """Return False once the pending queue is drained but URLs were seen; else True."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort '省-市' extraction from *strs*, falling back to *pro*.

        Returns None when the address parser raises (original behaviour).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # parser failure: caller tolerates a None result
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url):
        """Fetch one announcement page, parse it and store the record in MongoDB."""
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//h2[@class="word-title"]/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                # The original pattern used a (buggy) character class
                # ["招标","中标",...]{1,2}; use explicit alternation instead.
                match = re.search(
                    r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//h4[@class="word-info"]//text()')
            if publish_date:
                # e.g. '2018/7/5' -> '2018-7-5'; guard against a missing date
                # (the original crashed with AttributeError on no match).
                m = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',
                              ''.join(publish_date))
                publish_date = re.sub(r'\/', '-', m.group()) if m else None
            else:
                publish_date = None
            print(publish_date, title)
            area_name = '江苏-苏州'
            source = 'http://szzyjy.fwzx.suzhou.gov.cn'
            table_ele = selector.xpath('//div[@class="border"]')
            if table_ele:
                table_ele = table_ele[0]
            else:
                return  # no recognisable content: skip this page
            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '苏州市公共资源交易中心',
                'en_name': 'Suzhou City Public resource',
            }
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page for *types* and enqueue unseen detail URLs."""
        try:
            params = (
                ('paging', page),
            )
            url = 'http://szzyjy.fwzx.suzhou.gov.cn/Front/jyzx/{}/'.format(
                types)
            response = requests.get(url=url, headers=self.headers,
                                    params=params).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            try:
                url_li = selector.xpath(
                    '//*[@class="mr-content"]/div[1]/table/tr/td[1]/a/@href')
            except Exception:
                time.sleep(3)
                self.load_get(categoryId, types, page)
                # The original fell through here with url_li unbound
                # (NameError); return after the retry instead.
                return
            for url in url_li:
                urls = 'http://szzyjy.fwzx.suzhou.gov.cn' + url
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        """Worker loop: drain the Redis queue with small batches of greenlets."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # queue nearly empty: shrink the batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Enumerate all listing categories and start queue consumers."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': '002004/002004001', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004002', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004003', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004004', 'all_page': 1},
            {'categoryId': '', 'types': '002004/002004005', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004006', 'all_page': 1},
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Leftovers still queued: start another consumer pass.
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Guiyang government procurement site (贵阳政府采购网, www.gygp.gov.cn).

    ``load_get`` fetches each listing page and processes the detail links
    inline via ``load_get_html``, which parses an announcement and stores it
    in MongoDB. Pages are GB18030-encoded.
    """

    def __init__(self):
        name = 'guiyang_gygp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gygp.gov.cn/list-37-1.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        # Redis-backed URL queue: dblist = pending queue, dbset = seen-set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='guiyang_list1',
                             dbset='guiyang_set1')

    def is_running(self):
        """Return False once the pending queue is drained but URLs were seen; else True."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort '省-市' extraction from *strs*, falling back to *pro*.

        Returns None when the address parser raises (original behaviour).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # parser failure: caller tolerates a None result
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url, retry=3):
        """Fetch one announcement page, parse it and store the record in MongoDB.

        *retry* bounds the retransmission attempts; the original recursed
        unconditionally on failure, which could overflow the stack when the
        site is persistently unreachable.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            if retry > 0:
                self.load_get_html(url, retry - 1)
        else:
            title = selector.xpath('//div[@class="biaoti"]/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', title[0])
                # Two CJK characters followed by 公告, e.g. 招标公告 / 中标公告.
                match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//div[@class="fbsj"]/span/text()')
            if publish_date:
                # Guard against a missing date (the original crashed with
                # AttributeError on no match).
                m = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(publish_date))
                publish_date = m.group() if m else None
            else:
                publish_date = None
            area_name = '贵州-贵阳'
            source = 'http://www.gygp.gov.cn/'
            table_ele = selector.xpath('//div[@class="content_box"]')
            if table_ele:
                table_ele = table_ele[0]
            else:
                return  # no recognisable content: skip this page
            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '贵阳市政府采购网',
                'en_name': 'Guiyang City Government Procurement',
            }
            self.save_to_mongo(retult_dict)

    def load_get(self, types, page, retry=3):
        """Fetch one listing page and process each detail link inline.

        *retry* bounds the retransmission attempts (see ``load_get_html``).
        """
        try:
            url = 'http://www.gygp.gov.cn/' + types + str(page) + '.html'
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retry > 0:
                self.load_get(types, page, retry - 1)
        else:
            print('第{}页'.format(page))
            url_li = selector.xpath(
                '//div[@class="right_top_content"]/ul/li/span/a/@href')
            for url in url_li:
                # NOTE(review): the listing comes from gygp.gov.cn but details
                # are fetched from gyzfcg.gyggzy.cn — confirm this host is
                # intentional.
                urls = 'http://gyzfcg.gyggzy.cn' + url
                self.load_get_html(urls)

    def init(self):
        """Worker loop: drain the Redis queue with small batches of greenlets."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # queue nearly empty: shrink the batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Enumerate all configured listing categories."""
        task_li = [
            {'type': 'list-12-', 'all_page': 3},
            {'type': 'list-13-', 'all_page': 3},
            {'type': 'list-27-', 'all_page': 2},
            {'type': 'list-36-', 'all_page': 1},
            {'type': 'list-28-', 'all_page': 1},
            {'type': 'list-37-', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    types = task['type']
                    spawns = [
                        gevent.spawn(self.load_get, types, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Heilongjiang government procurement site (黑龙江政府采购网, www.hljcg.gov.cn).

    ``load_get`` POSTs to the paging action and processes each detail link
    inline via ``load_get_html``, which parses an announcement and stores it
    in MongoDB.
    """

    def __init__(self):
        name = 'heilongjiang_hljcg_gov_cn'
        self.coll = StorageSetting(name)
        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.hljcg.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.hljcg.gov.cn/xwzs^!queryXwxxqx.action',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Redis-backed URL queue: dblist = pending queue, dbset = seen-set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='heilongjiang_list1',
                             dbset='heilongjiang_set1')

    def is_running(self):
        """Return False once the pending queue is drained but URLs were seen; else True."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort '省-市' extraction from *strs*, falling back to *pro*.

        Returns None when the address parser raises (original behaviour);
        note callers may pass a ``None`` title, which lands in that path.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # parser failure: caller tolerates a None result
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url, retry=3):
        """Fetch one announcement page, parse it and store the record in MongoDB.

        *retry* bounds the retransmission attempts; the original recursed
        unconditionally on failure, which could overflow the stack when the
        site is persistently unreachable.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                allow_redirects=False).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            if retry > 0:
                self.load_get_html(url, retry - 1)
        else:
            title = selector.xpath('//div[@class="mtt"]/p[1]/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', title[0])
                # Two CJK characters followed by 公告, e.g. 招标公告 / 中标公告.
                match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//div[@class="mtt"]/p[2]/text()')
            if publish_date:
                # Guard against a missing date (the original crashed with
                # AttributeError on no match).
                m = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(publish_date))
                publish_date = m.group() if m else None
            else:
                publish_date = None
            area_name = self.get_area('黑龙江', title)
            source = 'http://www.hljcg.gov.cn/'
            table_ele = selector.xpath('//div[@class="xxej"]')
            if table_ele:
                table_ele = table_ele[0]
            else:
                return  # no recognisable content: skip this page
            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '黑龙江省政府采购网',
                'en_name': 'Heilongjiang Government Procurement',
            }
            self.save_to_mongo(retult_dict)

    def load_get(self, lbbh, page, retry=3):
        """POST for one listing page of category *lbbh* and process its links.

        *retry* bounds the retransmission attempts (see ``load_get_html``).
        """
        try:
            data = [
                ('xwzsPage.pageNo', page),
                ('xwzsPage.pageSize', '20'),
                ('xwzsPage.pageCount', '1293'),
                ('lbbh', lbbh),
                ('xwzsPage.LBBH', lbbh),
                ('xwzsPage.zlbh', ''),
                ('xwzsPage.GJZ', ''),
            ]
            url = 'http://www.hljcg.gov.cn/xwzs!queryXwxxqx.action'
            response = requests.post(url=url, headers=self.headers,
                                     data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retry > 0:
                self.load_get(lbbh, page, retry - 1)
        else:
            print('第{}页'.format(page))
            url_li = selector.xpath(
                '//div[@class="yahoo"]/div/span[1]/a/@onclick')
            for url in url_li:
                # The detail link is embedded in an onclick handler.
                urls = re.findall(r"href='(.*?)'", url)[0]
                urls = 'http://www.hljcg.gov.cn' + urls
                self.load_get_html(urls)

    def init(self):
        """Worker loop: drain the Redis queue with small batches of greenlets."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # queue nearly empty: shrink the batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Enumerate all configured categories (lbbh codes)."""
        task_li = [
            {'lbbh': '4', 'all_page': 2},
            {'lbbh': '30', 'all_page': 1},
            {'lbbh': '99', 'all_page': 1},
            {'lbbh': '98', 'all_page': 1},
            {'lbbh': '5', 'all_page': 2},
        ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    lbbh = task['lbbh']
                    spawns = [
                        gevent.spawn(self.load_get, lbbh, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Leftovers still queued: start a consumer pass.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Guangdong procurement mall (广东采购电子商城, www.gpcgd.com).

    ``load_get`` POSTs to the portal-news list action, extracts article ids
    from onclick handlers and queues them in Redis; ``load_get_html`` workers
    (spawned by ``init``) resolve each id, parse the announcement and persist
    it to MongoDB.
    """

    def __init__(self):
        name = 'guangdong_gpcgd_com'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gpcgd.com',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gpcgd.com/gpcgd/portal/portal-news^!list',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Redis-backed queue: dblist = pending article ids, dbset = seen-set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangdong_gpcgd_com_list1',
                             dbset='guangdong_gpcgd_com_set1')

    def is_running(self):
        """Return False once the pending queue is drained but ids were seen; else True."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort '省-市' extraction from *strs*, falling back to *pro*.

        Returns None when the address parser raises (original behaviour).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # parser failure: caller tolerates a None result
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, pid):
        """Resolve article id *pid*, parse the page and store the record."""
        if pid is None:
            return
        try:
            url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!detailNews?portalNews.id={}'.format(
                pid)
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="pub_title"]/h1/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                # The original pattern used a (buggy) character class
                # ["招标","中标",...]{1,2}; use explicit alternation instead.
                match = re.search(
                    r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//div[@class="pub_note"]//text()')
            if publish_date:
                # Guard against a missing date (the original crashed with
                # AttributeError on no match).
                m = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(publish_date))
                publish_date = m.group() if m else None
            else:
                publish_date = None
            print(publish_date, title)
            area_name = '广东'
            source = 'http://www.gpcgd.com/'
            table_ele = selector.xpath('//div[@class="pub_cont_details"]')
            if table_ele:
                table_ele = table_ele[0]
            else:
                return  # no recognisable content: skip this item
            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '广东省政府采购中心',
                'en_name': 'Guangdong Government Procurement Center',
            }
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """POST for one listing page of *types* and enqueue unseen article ids."""
        try:
            data = [
                ('portalNews.typeId', types),
                ('pageNum', page),
            ]
            url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!list'
            response = requests.post(url=url, headers=self.headers,
                                     data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            # Article ids live in onclick="detailNews('<id>')" handlers.
            url_li = re.findall(r'onclick\=\"detailNews\(\'(.*?)\'\)\"',
                                response)
            for pid in url_li:
                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)

    def init(self):
        """Worker loop: drain the Redis queue with small batches of greenlets."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # queue nearly empty: shrink the batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Enumerate all listing type ids and start queue consumers."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': '90011', 'all_page': 1},
            {'categoryId': '', 'types': '90013', 'all_page': 1},
            {'categoryId': '', 'types': '40011', 'all_page': 2},
            {'categoryId': '', 'types': '40012', 'all_page': 2},
            {'categoryId': '', 'types': '40013', 'all_page': 1},
            {'categoryId': '', 'types': '40014', 'all_page': 1},
            {'categoryId': '', 'types': '40015', 'all_page': 1},
            {'categoryId': '', 'types': '40016', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Leftovers still queued: start another consumer pass.
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Nanchang government procurement site (www.ncszfcg.gov.cn).

    load_get() pages through the list endpoint and feeds each <li> row
    directly to load_get_html(), which fetches the detail page and stores a
    record in MongoDB via StorageSetting.  The Redis queue (Rdis_Queue) and
    the init() consumer exist but the enqueue calls are commented out, so the
    queue path is effectively disabled in this class.
    """

    def __init__(self):
        name = 'nanchang_ncszfcg_gov_cn'  # MongoDB collection name
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Browser-like headers so the site serves regular HTML.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.ncszfcg.gov.cn/index2018.cfm',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        self.rq = Rdis_Queue(host='localhost', dblist='nanchang_list1', dbset='nanchang_set1')

    def is_running(self):
        # Finished once the work list is drained while the de-dup set has
        # accumulated entries; otherwise keep running.
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        # MD5 hex digest of sign_str; used as the MongoDB _id.
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        # Current local time as 'YYYY-MM-DD HH:MM:SS'.
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        # Persist one record, then refresh the running flag.
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        # Best-effort 'province-city' label derived from strs via the external
        # `transform` helper; falls back to `pro` when nothing is recognised.
        # Returns None implicitly when transform() raises.
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            # NOTE(review): bare except silently swallows all errors.
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, li):
        """Parse one <li> HTML fragment from a list page, fetch the linked
        detail page and persist the combined record."""
        if li == None:
            return
        try:
            selector_li = etree.HTML(str(li))
            url = 'http://www.ncszfcg.gov.cn/' + selector_li.xpath('//li/a/@href')[0]
            print(url)
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector_li.xpath('//li/a/@title')
            if title != []:
                title = title[0]
                try:
                    # Status = the two CJK chars preceding '公告' at the end.
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector_li.xpath('//li/div/text()')
            if publish_date != []:
                # NOTE(review): .group() raises AttributeError when the regex
                # does not match; the record is then dropped.
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date)).group()
            else:
                publish_date = None
            print(publish_date, title)
            area_name = '江西-南昌'
            source = 'http://www.ncszfcg.gov.cn/'
            # NOTE(review): IndexError aborts the record when the detail page
            # lacks the expected container div.
            table = selector.xpath('//div[@class="ewb-detail-box"]')[0]
            content_html = etree.tostring(table, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '南昌市政府采购网'
            retult_dict['en_name'] = 'Nanchang Government Procurement'
            self.save_to_mongo(retult_dict)

    def load_get(self, page):
        """Fetch list page `page` and hand each <li> row to load_get_html."""
        try:
            params = {
                'sid': '100002',
                'Page': page,
            }
            url = 'http://www.ncszfcg.gov.cn/more2018.cfm'
            response = requests.get(url=url, headers=self.headers, params=params).text
            selector = etree.HTML(response)
        except:
            # NOTE(review): unbounded recursive retry on ANY failure — can
            # recurse forever if the site stays unreachable.
            print('load_post error')
            self.load_get(page)
        else:
            print('第{}页'.format(page))
            ul_li_ele = selector.xpath('//ul[@class="listbox"]/li')
            for ul_li in ul_li_ele:
                li = etree.tostring(ul_li, pretty_print=True, encoding='utf-8', method='html').decode('utf-8')
                self.load_get_html(li)

    def init(self):
        # Consumer loop: drain the Redis list with batches of greenlets.
        # NOTE(review): unused here — nothing enqueues into the Redis list.
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 909},  # full-history crawl
            {'all_page': 3},
        ]
        count = 3  # list pages fetched concurrently per gevent batch
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [gevent.spawn(self.load_get, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Jiangsu government procurement site (www.ccgp-jiangsu.gov.cn).

    Producer/consumer layout: run() pages through many category/city list
    sections and pushes unseen detail URLs into a Redis-backed queue
    (Rdis_Queue) for de-duplication, while a background thread (init) drains
    the queue with gevent greenlets, parses each detail page and stores the
    record in MongoDB via StorageSetting.
    """

    def __init__(self):
        name = 'jiangsu_ccgp-jiangsu_gov_cn'  # MongoDB collection name
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Browser-like headers so the site serves regular HTML.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/index_1.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        self.rq = Rdis_Queue(host='localhost', dblist='jiangsu_list1', dbset='jiangsu_set1')

    def is_running(self):
        # Finished once the work list is drained while the de-dup set has
        # accumulated entries; otherwise keep running.
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        # MD5 hex digest of sign_str; used as the MongoDB _id.
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        # Current local time as 'YYYY-MM-DD HH:MM:SS'.
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        # Persist one record, then refresh the running flag.
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        # Best-effort 'province-city' label derived from strs via the external
        # `transform` helper; falls back to `pro` when nothing is recognised.
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            # NOTE(review): bare except silently swallows all errors.
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        """Fetch one detail page (url popped from the Redis list; None when
        the queue is empty) and persist the parsed record."""
        if url == None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="dtit"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    # Status = the two CJK chars preceding '公告' at the end.
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath(
                '//div[@class="detail_bz"]/span/text()')
            if publish_date != []:
                # NOTE(review): .group() raises when the regex misses.
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date)).group()
            else:
                publish_date = None
            area_name = self.get_area('江苏', title)
            source = 'http://www.ccgp-jiangsu.gov.cn/'
            table = selector.xpath('//div[@class="detail"]')
            if table != []:
                table = table[0]
            else:
                return  # no content container — skip this record
            content_html = etree.tostring(table, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '江苏政府采购网'
            retult_dict['en_name'] = 'Jiangsu Government Procurement'
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def load_get(self, base_url, page):
        """Fetch one list page of a section (page 0 = section index) and
        enqueue unseen detail URLs."""
        try:
            if page == 0:
                url = base_url
            else:
                url = base_url + 'index_' + str(page) + '.html'
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except:
            print('load_post error')
        else:
            url_li = selector.xpath('//div[@class="list_list"]/ul/li/a/@href')
            if url_li == []:
                # Some sections use an alternate container class.
                url_li = selector.xpath(
                    '//div[@class="list_list02"]/ul/li/a/@href')
            for url in url_li:
                urls = base_url + url.replace('./', '')
                # Redis set de-duplicates across runs; list feeds consumers.
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        # Consumer loop: drain the Redis list with batches of greenlets.
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # shrink batch near the end of the queue
            try:
                spawns = [
                    gevent.spawn(self.load_get_html,
                                 self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # Start the consumer, then walk every section/city list.
        threading.Thread(target=self.init).start()
        flag = 2  # pages crawled per section
        task_li = [
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cgyg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/htgg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/xqyj/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/ysgg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/shengji/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/suzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/nanjing/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/wuxi/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/changzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/zhenjiang/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/nantong/', 'all_page': flag},
            # NOTE(review): duplicate of the first entry; by symmetry with the
            # gzgg/cjgg lists 'cgxx/cggg/taizhou/' was probably intended.
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/yangzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/yancheng/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/huaian/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/suqian/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/lianyungang/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/xuzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/shengji/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/suzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/nanjing/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/wuxi/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/changzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/zhenjiang/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/nantong/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/taizhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/yangzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/yancheng/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/huaian/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/suqian/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/lianyungang/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/xuzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/shengji/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/suzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/nanjing/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/wuxi/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/changzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/zhenjiang/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/nantong/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/taizhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/yangzhou/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/yancheng/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/huaian/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/suqian/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/lianyungang/', 'all_page': flag},
            {'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/xuzhou/', 'all_page': flag},
        ]
        count = 3  # list pages fetched concurrently per gevent batch
        for task in task_li:
            for page in range(0, task['all_page'] + 1, count):
                try:
                    base_url = task['url']
                    spawns = [
                        gevent.spawn(self.load_get, base_url, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Changchun government procurement site (www.cczfcg.gov.cn).

    run() submits the search form page by page; load_get() extracts result
    rows and calls load_get_html() synchronously for each detail URL.  The
    Redis queue exists mostly for bookkeeping; a consumer thread is only
    started at the end if items remain in the list.
    """

    def __init__(self):
        name = 'changchun_cczfcg_gov_cn'  # MongoDB collection name
        self.coll = StorageSetting(name)
        # Headers mimic the site's Flash-era AJAX client.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'X-Requested-With': 'ShockwaveFlash/30.0.0.134',
        }
        self.session = requests.session()
        self.rq = Rdis_Queue(host='localhost', dblist='changchun_list1', dbset='changchun_set1')

    def is_running(self):
        # Finished once the work list is drained while the de-dup set has
        # accumulated entries; otherwise keep running.
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        # MD5 hex digest of sign_str; used as the MongoDB _id.
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        # Current local time as 'YYYY-MM-DD HH:MM:SS'.
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        # Persist one record, then refresh the running flag.
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        # Best-effort 'province-city' label derived from strs via the external
        # `transform` helper; falls back to `pro` when nothing is recognised.
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            # NOTE(review): bare except silently swallows all errors.
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        """Fetch one detail page and persist the parsed record.

        NOTE(review): unlike the sibling classes, _id is derived from the
        title, not the URL, so same-titled announcements overwrite each other.
        """
        if url == None:
            return
        try:
            response = self.session.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//*[@id="wrap"]/div[1]/div[2]/div/div[2]/center/span/text()')
            if title == []:
                # Fallback layout: title lives in the first table caption.
                title = selector.xpath('//*[@id="wrap"]/div[1]/div[2]/div/div[2]/table[1]/caption/text()')
                if title != []:
                    title = title[0]
                    try:
                        status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                    except:
                        status = '公告'
                else:
                    title = None
                    status = '公告'
            else:
                try:
                    # NOTE(review): bug — here `title` is still a LIST, so
                    # re.search raises TypeError and status is always '公告'.
                    # The title[0] extraction below should come first.
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
                title = title[0]
            _id = self.hash_to_md5(title)
            publish_date = selector.xpath('//*[@id="wrap"]/div[1]/div[2]/div/div[2]/p[2]/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date)).group()
            else:
                publish_date = None
            # Content block captured with BeautifulSoup rather than lxml.
            soup = BeautifulSoup(response)
            content_html = soup.find(class_='details')
            area_name = self.get_area('长春', title)
            source = 'http://www.cczfcg.gov.cn'
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '长春市政府采购网'
            retult_dict['en_name'] = 'Changchun City Government Procurement'
            self.save_to_mongo(retult_dict)

    def load_get(self, params):
        """Submit one search-form page and process every result row inline."""
        try:
            url = 'http://www.cczfcg.gov.cn/article/bid_list.action'
            response = self.session.get(url=url, headers=self.headers, params=params).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            url_li = selector.xpath('//*[@id="row"]/tbody/tr/td/a/@href')
            for url in url_li:
                urls = 'http://www.cczfcg.gov.cn' + url
                self.load_get_html(urls)

    def init(self):
        # Consumer loop: drain the Redis list one greenlet at a time.
        count = 1
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # threading.Thread(target=self.init).start()
        task_li = [
            # Full-history page counts kept for reference:
            # {'type':1,'field':1,'all_page': 35},
            # {'type':1,'field':2,'all_page': 129},
            # {'type':2,'field':1,'all_page': 32},
            # {'type':2,'field':2,'all_page': 130},
            {'type': 1, 'field': 1, 'all_page': 1},
            {'type': 1, 'field': 2, 'all_page': 1},
            {'type': 2, 'field': 1, 'all_page': 1},
            {'type': 2, 'field': 2, 'all_page': 1},
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                # Struts search-form parameters; __fp/_sourcePage are
                # session tokens captured from the site.
                params = {
                    'field': task['field'],
                    'title': '',
                    'd-16544-p': str(page),
                    'getList': '搜索',
                    'type': task['type'],
                    '__fp': 'V7VgOK3HYWUBON82huO8GA ==',
                    '_sourcePage': '1dxhayx - Cv4gbrz1QGkYn6WfINWh0k0sL4lzLkek3lM =',
                }
                try:
                    self.load_get(params)
                except Exception as e:
                    print(e)
                print('第{}页'.format(page))
        # Only start a consumer if something was queued.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Hangzhou public resources trading site (www.hzctc.cn).

    load_get() POSTs to the site's JSON grid endpoint (GetNotice) and passes
    each row dict straight to load_get_html(), which builds the detail URL
    from the row's ID fields, fetches the page and stores the record in
    MongoDB via StorageSetting.  Redis/queue plumbing is present but the
    enqueue calls are commented out.
    """

    def __init__(self):
        name = 'hangzhou_hzctc_cn'  # MongoDB collection name
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # AJAX-style headers matching the site's jqGrid requests.
        self.headers = {
            'Origin': 'http://www.hzctc.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.hzctc.cn/SecondPage/ProjectAfficheList?area=^&afficheType=22^&proID=^&title=',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.rq = Rdis_Queue(host='localhost', dblist='hangzhou_hzctc_cn_list1', dbset='hangzhou_hzctc_cn_set1')

    def is_running(self):
        # Finished once the work list is drained while the de-dup set has
        # accumulated entries; otherwise keep running.
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        # MD5 hex digest of sign_str; used as the MongoDB _id.
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        # Current local time as 'YYYY-MM-DD HH:MM:SS'.
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        # Persist one record, then refresh the running flag.
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        # Best-effort 'province-city' label derived from strs via the external
        # `transform` helper; falls back to `pro` when nothing is recognised.
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            # NOTE(review): bare except silently swallows all errors.
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, data_dic):
        """Build the detail URL from one JSON grid row, fetch it and persist
        the parsed record.  Expects keys ID, TenderName, PublishStartTime and
        either IsInner or TenderID."""
        if data_dic == None:
            return
        try:
            try:
                url = 'http://www.hzctc.cn/AfficheShow/Home?AfficheID={}&IsInner={}&ModuleID=22'.format(
                    data_dic['ID'], data_dic['IsInner'])
            except:
                # Rows without IsInner are bid-opening records.
                url = 'http://www.hzctc.cn/OpenBidRecord/Index?id={}&tenderID={}&ModuleID=22'.format(
                    data_dic['ID'], data_dic['TenderID'])
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            # Title comes from the JSON row, not from the detail page.
            title = [data_dic['TenderName']]
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    # NOTE(review): [...] here is a character CLASS, not a
                    # group of alternatives — it matches any of the listed
                    # chars/quotes/commas, not whole words like 招标/中标.
                    status = re.search(
                        r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$',
                        title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = [data_dic['PublishStartTime']]
            if publish_date != []:
                # NOTE(review): .group() raises when the regex misses.
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            print(publish_date, title)
            area_name = '浙江-杭州'
            source = 'http://www.hzctc.cn'
            table_ele = selector.xpath('//div[@class="MainList"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return  # no content container — skip this record
            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '杭州市公共资源交易网'
            retult_dict['en_name'] = 'Hangzhou Public resource'
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """POST one page of the jqGrid JSON endpoint for category
        `categoryId` and process every returned row."""
        try:
            data = [
                ('area', ''),
                ('afficheType', categoryId),
                ('IsToday', ''),
                ('title', ''),
                ('proID', ''),
                ('number', ''),
                ('_search', 'false'),
                ('nd', str(int(time.time() * 1000))),  # cache-buster
                ('rows', '10'),
                ('page', page),
                ('sidx', 'PublishStartTime'),
                ('sord', 'desc'),
            ]
            url = 'http://www.hzctc.cn/SecondPage/GetNotice'
            response = requests.post(url=url, headers=self.headers, data=data).json()
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            response_li = response['rows']
            for data_dic in response_li:
                self.load_get_html(data_dic)

    def init(self):
        # Consumer loop for the Redis queue (unused: nothing enqueues).
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # threading.Thread(target=self.init).start()
        # categoryId = afficheType of each announcement category.
        task_li = [
            {'categoryId': '22', 'types': 'jyxx', 'all_page': 2},
            {'categoryId': '27', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '23', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '465', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '24', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '486', 'types': 'jyxx', 'all_page': 2},
            {'categoryId': '25', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '28', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '26', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '32', 'types': 'jyxx', 'all_page': 1},
        ]
        count = 2  # list pages fetched concurrently per gevent batch
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Hainan government procurement site (www.ccgp-hainan.gov.cn).

    Producer/consumer layout: run() pages through the list endpoint and
    pushes unseen detail URLs into a Redis-backed queue (Rdis_Queue), while a
    background thread (init) drains the queue with gevent greenlets, parses
    each detail page and stores the record in MongoDB via StorageSetting.
    """

    def __init__(self):
        name = 'hainan_ccgp-hainan_gov_cn'  # MongoDB collection name
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Browser-like headers so the site serves regular HTML.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.ccgp-hainan.gov.cn/thirdparty/My97DatePicker/My97DatePicker.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': 'EAC4BA3425D26FC6B117994EFF4DEC28',
        }
        self.session = requests.session()
        self.rq = Rdis_Queue(host='localhost', dblist='hainan_list1', dbset='hainan_set1')

    def is_running(self):
        """Return False once the work list is drained while the de-dup set is
        non-empty (i.e. a crawl happened and finished); True otherwise."""
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        """Return the MD5 hex digest of *sign_str*; used as the MongoDB _id."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def get_area(self, pro, strs):
        """Best-effort 'province-city' label derived from *strs* via the
        external `transform` helper; falls back to *pro* when nothing is
        recognised, None when transform() fails."""
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Narrowed from a bare except; unparseable titles yield None.
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def save_to_mongo(self, result_dic):
        """Persist one record, then refresh the running flag."""
        self.coll.saves(result_dic)
        self.is_running()

    def load_get_html(self, url):
        """Fetch one detail page and persist the parsed announcement record.

        *url* is popped from the Redis work list; None (empty queue) is
        ignored — guard added for consistency with the sibling scrapers.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # BUG FIX: the original 'laod_get_html error:'.format(e) had no
            # '{}' placeholder, so the exception text was silently dropped.
            print('load_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="nei03_02"]/div[1]/text()')
            if title:
                title = title[0]
                # Status = the two CJK chars preceding '公告' at the end.
                match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath(
                '//div[@class="nei03_02"]/div[2]//text()')
            if publish_date:
                # Guarded: the original called .group() unconditionally and
                # crashed the greenlet on pages without a recognisable date.
                match = re.search(r'(\d+\-\d+\-\d+)', ''.join(publish_date))
                publish_date = match.group() if match else None
            else:
                publish_date = None
            # Explicit parser: the bare BeautifulSoup(response) call picked
            # whichever parser happened to be installed (and warns).
            soup = BeautifulSoup(response, 'html.parser')
            content_html = soup.find(class_='nei03_02')
            source = 'http://www.ccgp-hainan.gov.cn/'
            area_name = self.get_area('海南', title)
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['publish_date'] = publish_date
            retult_dict['source'] = source
            retult_dict['area_name'] = area_name
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '中国海南政府采购网 '
            retult_dict['en_name'] = 'Hainan Province Government Procurement'
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def load_get(self, params):
        """Fetch one list page and enqueue unseen detail URLs into Redis."""
        try:
            url = 'http://www.ccgp-hainan.gov.cn/cgw/cgw_list.jsp'
            response = self.session.get(
                url=url, headers=self.headers,
                params=params).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//div[@class="nei02_04_01"]/ul/li/em/a/@href')
        except Exception as e:
            # Narrowed from a bare except and the message now names this
            # method (was 'load_post error') and includes the cause.
            print('load_get error:{}'.format(e))
        else:
            for url in url_li:
                url = 'http://www.ccgp-hainan.gov.cn' + url
                # The Redis set de-duplicates across runs; the list feeds
                # the consumer greenlets.
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def init(self):
        """Consumer loop: drain the Redis list with batches of greenlets."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # shrink the batch near the end of the queue
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl the configured page range."""
        threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 2521},  # full-history crawl
            {
                'all_page': 5
            },
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = (
                    ('currentPage', str(page)),
                    ('begindate', ''),
                    ('enddate', ''),
                    ('title', ''),
                    ('bid_type', ''),
                    ('proj_number', ''),
                    ('zone', ''),
                )
                self.load_get(params)
                print('第{}页'.format(page))
        # Restart a consumer in case items remained after the first pass.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Scraper for the Fujian government procurement site (cz.fjzfcg.gov.cn).

    Unlike the sibling classes this one is parameterised: the caller supplies
    the detail-URL prefix (`source`), the list-endpoint prefix (`base_url`)
    and the number of pages to crawl.  load_get() pulls list rows and hands
    each <tr> fragment straight to load_get_html(); the Redis queue plumbing
    is present but the enqueue calls are commented out.
    """

    def __init__(self, source, base_url, all_page):
        name = 'fujian_cz_fjzfcg_gov_cn'  # MongoDB collection name
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Browser-like headers so the site serves regular HTML.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://cz.fjzfcg.gov.cn/3500/noticelist/d03180adb4de41acbb063875889f9af1/?page=1',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        self.source = source        # prefix for detail URLs found in rows
        self.base_url = base_url    # prefix of the list endpoint
        self._all_page = all_page   # number of list pages to crawl
        self.rq = Rdis_Queue(host='localhost', dblist='fujian_list1', dbset='fujian_set1')

    def is_running(self):
        # Finished once the work list is drained while the de-dup set has
        # accumulated entries; otherwise keep running.
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        # MD5 hex digest of sign_str; used as the MongoDB _id.
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        # Current local time as 'YYYY-MM-DD HH:MM:SS'.
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        # Persist one record, then refresh the running flag.
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        # Best-effort 'province-city' label derived from strs via the external
        # `transform` helper; falls back to `pro` when nothing is recognised.
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            # NOTE(review): bare except silently swallows all errors.
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, tr):
        """Parse one <tr> fragment from a list page (columns: area, status,
        ?, title/link, date), fetch the linked detail page and persist the
        combined record."""
        if tr == None:
            return
        try:
            selector_tr = etree.HTML(str(tr))
            url = self.source + selector_tr.xpath('//tr/td[4]/a/@href')[0]
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector_tr.xpath('//tr/td[4]/a/text()')
            if title != []:
                title = title[0]
            else:
                title = None
            # Status is taken verbatim from the list row's second column.
            status = selector_tr.xpath('//tr/td[2]/text()')
            if status != []:
                status = status[0]
            else:
                status = None
            _id = self.hash_to_md5(url)
            publish_date = selector_tr.xpath('//tr/td[5]/text()')
            if publish_date != []:
                publish_date = publish_date[0]
            else:
                publish_date = None
            # First column holds the district name used for area lookup.
            aaa = selector_tr.xpath('//tr/td[1]/text()')
            if aaa != []:
                aaa = aaa[0]
            else:
                aaa = '福建'
            area_name = self.get_area('福建', aaa)
            print(area_name)
            # NOTE(review): this local is unused — the record below stores a
            # hard-coded IP-based source instead of self.source.
            source = self.source
            table = selector.xpath('//*[@id="print-content"]')[0]
            content_html = etree.tostring(table, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = 'http://117.27.88.250:9306/'
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '福建省政府采购网'
            retult_dict['en_name'] = 'Fujian Province Government Procurement'
            self.save_to_mongo(retult_dict)

    def load_get(self, page):
        """Fetch list page `page` and hand every <tr> row to load_get_html."""
        try:
            params = {
                'page': str(page),
            }
            url = self.base_url + 'noticelist/d03180adb4de41acbb063875889f9af1/'
            print(url)
            response = requests.get(url=url, headers=self.headers, params=params).text
            selector = etree.HTML(response)
        except:
            # NOTE(review): bare except; message says 'post' for a GET.
            print('load_post error')
        else:
            print('第{}页'.format(page))
            tr_ele_li = selector.xpath('//div[@class="wrapTable"]/table/tbody/tr')
            for tr_ele in tr_ele_li:
                tr = etree.tostring(tr_ele, pretty_print=True, encoding='utf-8', method='html').decode('utf-8')
                self.load_get_html(tr)

    def init(self):
        # Consumer loop for the Redis queue (unused: nothing enqueues).
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 9111},  # full-history crawl
            {'all_page': self._all_page},
        ]
        count = 4  # list pages fetched concurrently per gevent batch
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [gevent.spawn(self.load_get, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Qingdao government procurement site (青島政府采购网).

    ``load_get`` posts to a DWR list endpoint and extracts notice ids;
    ``load_get_html`` fetches/parses one notice detail page and stores it
    in MongoDB via ``save_to_mongo``.
    """

    def __init__(self):
        name = 'qingdao_ccgp-qingdao_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Headers captured from a browser session; the '^&' sequences come
        # from the original curl-on-Windows capture and are sent verbatim.
        self.headers = {
            'Origin': 'https://www.ccgp-qingdao.gov.cn',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'text/plain',
            'Accept': '*/*',
            'Referer': 'https://www.ccgp-qingdao.gov.cn/sdgp2014/site/channelall370200.jsp?colcode=0401^&flag=0401',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        # Redis-backed work list + dedupe set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='qingdao_list1',
                             dbset='qingdao_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises (unparsable input,
        helper unavailable) — kept as a silent fallback like the original.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; narrowed but still best-effort
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, ids):
        """Fetch the notice detail page for *ids*, parse it and store the record."""
        if ids is None:  # BUG FIX: was `== None`
            return
        try:
            url = ('http://www.ccgp-qingdao.gov.cn/sdgp2014/site/read370200.jsp?id='
                   + str(ids))
            response = requests.get(url=url, headers=self.headers,
                                    verify=False).content.decode("gb18030")
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="biaot"]/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except AttributeError:  # title has no trailing notice-type suffix
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//div[@class="biaotq"]/text()')
            if publish_date:
                publish_date = re.sub(
                    r'年|月', '-',
                    re.search(r'(\d{4}年\d+月\d{1,2})',
                              ''.join(publish_date)).group())
            else:
                publish_date = None
            area_name = '山东-青島'
            source = 'https://www.ccgp-qingdao.gov.cn/'
            table_ele = selector.xpath('//div[@class="cont"]')
            if not table_ele:
                return
            content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '青岛市政府采购网',
                'en_name': 'Qingdao City Government Procurement',
            }
            self.save_to_mongo(result_dict)

    def load_get(self, types, page):
        """POST the DWR list endpoint for column *types* / page *page* and
        process every notice id found in the response."""
        try:
            url = 'http://www.ccgp-qingdao.gov.cn/sdgp2014/dwr/call/plaincall/dwrmng.queryWithoutUi.dwr'
            data = {
                'callCount': '1',
                'windowName': '',
                'c0-scriptName': 'dwrmng',
                'c0-methodName': 'queryWithoutUi',
                'c0-id': '0',
                'c0-param0': 'number:7',
                'c0-e1': 'string:' + types,
                'c0-e2': 'string:' + str(page),
                'c0-e3': 'number:10',
                'c0-e4': 'string:',
                'c0-e5': 'null:null',
                'c0-param1': 'Object_Object:{_COLCODE:reference:c0-e1, _INDEX:reference:c0-e2, _PAGESIZE:reference:c0-e3, _REGION:reference:c0-e4, _KEYWORD:reference:c0-e5}',
                'batchId': '8',
                'page': '%2Fsdgp2014%2Fsite%2Fchannelall370200.jsp%3Fcolcode%3D0401%26flag%3D0401',
                'httpSessionId': '',
                'scriptSessionId': '9BCA99F81A827529F202FF26A81421A0',
            }
            response = requests.post(url=url, headers=self.headers, data=data,
                                     verify=False).text
            payload = re.findall(r'rsltStringValue:"(.*?)"', response)[0]
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            # Payload is '?'-separated records; the first comma-field of each
            # record is the notice id.
            for record in payload.split('?'):
                self.load_get_html(record.split(',')[0])

    def init(self):
        """Drain the Redis work list with small gevent batches."""
        count = 1
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        task_li = [
            {'types': '0401', 'all_page': 3},
            {'types': '0402', 'all_page': 3},
            {'types': '0403', 'all_page': 2},
            {'types': '0404', 'all_page': 2},
            {'types': '0405', 'all_page': 2},
            {'types': '0406', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    types = task['types']
                    # BUG FIX: the original also called self.load_get(types, page)
                    # directly before spawning, scraping the first page of each
                    # pair twice; the spawns below already cover it.
                    spawns = [
                        gevent.spawn(self.load_get, types, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Chongqing government procurement site (重庆政府采购网).

    The site exposes a JSON API: ``load_get`` pages the notice list and
    enqueues ids into Redis; ``load_get_html`` fetches one notice's JSON
    detail and stores it in MongoDB.
    """

    def __init__(self):
        name = 'chongqing_cqgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Headers captured from a browser session; '^&'/'^%' escapes come from
        # the original curl-on-Windows capture and are sent verbatim.
        self.headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Referer': 'https://www.cqgp.gov.cn/notices/list?source=41,42^&area=^%^E9^%^87^%^8D^%^E5^%^BA^%^86^%^E5^%^B8^%^82^&purches=^%^E9^%^87^%^87^%^E8^%^B4^%^AD^%^E5^%^85^%^AC^%^E5^%^91^%^8A',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        # Redis-backed work list + dedupe set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='chongqing_list1',
                             dbset='chongqing_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; kept as silent fallback
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, pid, _retry=0):
        """Fetch the JSON detail of notice *pid* and store the parsed record.

        *_retry* is internal: the original recursed unconditionally on any
        request error, which could recurse forever; retries are now capped.
        """
        if pid is None:  # BUG FIX: was `== None`
            return
        try:
            proxies = proxy_pool.proxies()
            url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/{}'.format(
                pid)
            response = requests.get(url=url, headers=self.headers,
                                    proxies=proxies, timeout=10).json()
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            if _retry < 3:  # BUG FIX: bounded retry instead of infinite recursion
                self.load_get_html(pid, _retry + 1)
        else:
            title = response['notice']['title']
            # Purchase-way name is optional in the payload.
            status = response['notice'].get('projectPurchaseWayName', '公告')
            _id = self.hash_to_md5(url)
            publish_date = response['notice']['issueTime']
            # BUG FIX: original tested `!= []` on a string (always true) and
            # would crash on an empty issueTime.
            if publish_date:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         publish_date).group()
            else:
                publish_date = None
            area_name = '重庆'
            source = 'https://www.cqgp.gov.cn/'
            content_html = response['notice']['html']
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '重庆市政府采购网',
                'en_name': 'Chongqing City Government Procurement',
            }
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, page, _retry=0):
        """Fetch one page of the notice list and enqueue unseen notice ids."""
        try:
            params = (
                ('pi', page),
                ('ps', '20'),
                ('timestamp', str(int(time.time() * 1000))),
            )
            proxies = proxy_pool.proxies()
            url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable'
            response = requests.get(url=url, headers=self.headers,
                                    params=params, proxies=proxies,
                                    timeout=5).json()
        except Exception as e:
            print('load_get error:{}'.format(e))
            if _retry < 3:  # BUG FIX: bounded retry instead of infinite recursion
                self.load_get(page, _retry + 1)
        else:
            print('第{}页'.format(page))
            for data_dict in response['notices']:
                pid = data_dict['id']
                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)

    def init(self):
        """Drain the Redis work list with small gevent batches."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # Consumer thread drains the queue while the producer below fills it.
        threading.Thread(target=self.init).start()
        task_li = [
            {'all_page': 3},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Zhejiang public-resource trading platform (浙江公共资源交易信息网).

    ``load_get`` queries a full-text-search JSON API per category code and
    hands each record to ``load_get_html``, which rebuilds the static detail
    URL, fetches/parses the page and stores it in MongoDB.
    """

    def __init__(self):
        name = 'zhejiang_zjpubservice_zjzwfw_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zjpubservice.com/002/infogov.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        # Redis-backed work list + dedupe set.
        self.rq = Rdis_Queue(
            host='localhost',
            dblist='zhejiang_zjpubservice_zjzwfw_gov_cn_list1',
            dbset='zhejiang_zjpubservice_zjzwfw_gov_cn_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; kept as silent fallback
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, data_dic):
        """Rebuild the static detail URL from a search record, fetch and store it."""
        if data_dic is None:  # BUG FIX: was `== None`
            return
        try:
            urls = data_dic['link']
            # link carries infoid/categorynum/infodate; the static page path is
            # <cat[:3]>/<cat[:6]>/<cat>/<infodate>/<infoid>.html
            url_li = re.findall(
                r'infoid\=(.*?)\&categorynum\=(.*?)\&infodate\=(.*)', urls)[0]
            url = 'http://zjpubservice.zjzwfw.gov.cn/{}/{}/{}/{}/{}.html'.format(
                url_li[1][:3], url_li[1][:6], url_li[1], url_li[2], url_li[0])
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = [data_dic['title']]
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    status = re.search(
                        r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$',
                        title).group()
                except AttributeError:  # no recognised notice-type suffix
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = [data_dic['date']]
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            print(publish_date, title)
            area_name = self.get_area('浙江', data_dic['remark5'])
            source = 'http://zjpubservice.zjzwfw.gov.cn/'
            table_ele = selector.xpath('//div[@class="article_bd"]')
            if not table_ele:
                return
            content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '浙江省公共资源交易服务平台',
                'en_name': 'Zhejiang Public resource',
            }
            print(result_dict)
            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        """Query the full-text-search API for category *types*, page *page*,
        and process every returned record.

        *categoryId* is accepted for interface compatibility but unused by
        this endpoint.
        """
        try:
            params = (
                ('format', 'json'),
                ('sort', '0'),
                ('rmk1', types),
                ('pn', page),
                ('rn', '20'),
                ('idx_cgy', 'web'),
            )
            url = 'http://www.zjpubservice.com/fulltextsearch/rest/getfulltextdata'
            response = requests.get(url=url, headers=self.headers,
                                    params=params).json()
            # BUG FIX: the original printed the response and `return`ed here —
            # leftover debug code that made the record loop below unreachable.
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            for data_dic in response['result']['records']:
                self.load_get_html(data_dic)

    def init(self):
        """Drain the Redis work list with small gevent batches."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        task_li = [
            {'categoryId': '', 'types': '002001001', 'all_page': 3},
            {'categoryId': '', 'types': '002001002', 'all_page': 1},
            {'categoryId': '', 'types': '002001003', 'all_page': 3},
            {'categoryId': '', 'types': '002001004', 'all_page': 3},
            {'categoryId': '', 'types': '002001005', 'all_page': 3},
            {'categoryId': '', 'types': '002002001', 'all_page': 3},
            {'categoryId': '', 'types': '002002002', 'all_page': 3},
            {'categoryId': '', 'types': '002003001', 'all_page': 1},
            {'categoryId': '', 'types': '002003002', 'all_page': 1},
            {'categoryId': '', 'types': '002004001', 'all_page': 2},
            {'categoryId': '', 'types': '002004002', 'all_page': 1},
            {'categoryId': '', 'types': '002005001', 'all_page': 3},
            {'categoryId': '', 'types': '002005002', 'all_page': 1},
        ]
        count = 2
        # NOTE: page range intentionally starts at 0, matching the API paging.
        for task in task_li:
            for page in range(0, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Hunan government procurement site (湖南政府采购网).

    ``load_get`` posts the list API and hands each row dict to
    ``load_get_html``, which fetches the notice content page and stores the
    combined record in MongoDB.
    """

    def __init__(self):
        name = 'hunan_ccgp-hunan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Origin': 'http://www.ccgp-hunan.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.ccgp-hunan.gov.cn/page/notice/more.jsp?noticeTypeID=prcmNotices',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        # Redis-backed work list + dedupe set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='hunan_list1',
                             dbset='hunan_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; kept as silent fallback
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, data_dict):
        """Fetch the notice content for one list row (*data_dict*) and store it.

        Title/status/date come from the list row itself; only the HTML body is
        taken from the detail page.
        """
        try:
            url = ('http://www.ccgp-hunan.gov.cn/mvc/viewNoticeContent.do?noticeId='
                   + str(data_dict['NOTICE_ID']))
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = data_dict['NOTICE_TITLE']
            status = data_dict['NOTICE_NAME']
            _id = self.hash_to_md5(url)
            publish_date = data_dict['NEWWORK_DATE']
            area_name = '湖南'
            source = 'http://www.ccgp-hunan.gov.cn/'
            # Explicit parser (lxml is what bs4 auto-selected here before);
            # avoids the "no parser specified" warning and ambiguity.
            soup = BeautifulSoup(response, 'lxml')
            content_html = soup.find('table')
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '湖南政府采购网',
                'en_name': 'Hunan Government Procurement',
            }
            self.save_to_mongo(result_dict)

    def load_get(self, page, _retry=0):
        """POST one page of the notice-list API and process every row.

        *_retry* is internal: the original recursed unconditionally (under a
        bare ``except:``) on any failure; retries are now typed and capped.
        """
        try:
            data = [
                ('pType', ''),
                ('prcmPrjName', ''),
                ('prcmItemCode', ''),
                ('prcmOrgName', ''),
                ('startDate', '2019-01-17'),  # NOTE: hard-coded crawl window
                ('endDate', '2019-12-31'),
                ('prcmPlanNo', ''),
                ('page', page),
                ('pageSize', '18'),
            ]
            url = 'http://www.ccgp-hunan.gov.cn/mvc/getNoticeList4Web.do'
            response = requests.post(url=url, headers=self.headers,
                                     data=data).json()
        except Exception:  # BUG FIX: was a bare except: with unbounded recursion
            print('load_post error')
            if _retry < 3:
                self.load_get(page, _retry + 1)
        else:
            print('第{}页'.format(page))
            for data_dict in response['rows']:
                print(data_dict)
                self.load_get_html(data_dict)

    def init(self):
        """Drain the Redis work list with small gevent batches."""
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        task_li = [
            {'all_page': 3},
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Zhejiang government procurement site (浙江政府采购网).

    Both list and detail go through a CORS proxy endpoint on
    manager.zjzfcg.gov.cn that forwards to notice.zcy.gov.cn.
    """

    def __init__(self):
        name = 'zhejiang_manager_zjzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Origin': 'http://www.zjzfcg.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zjzfcg.gov.cn/purchaseNotice/index.html?categoryId=10',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        # Redis-backed work list + dedupe set (keep the original key names,
        # typo included, so existing Redis data stays reachable).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='zhejinag_list1',
                             dbset='zhejiang_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; kept as silent fallback
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, data_dict):
        """Fetch one notice's content through the CORS proxy and store it.

        Metadata (title/type/date/district) comes from the list row; only
        ``noticeContent`` is taken from the detail response.
        """
        try:
            proxies = proxy_pool.proxies()
            params = {
                'noticeId': data_dict['id'],
                'url': 'http://notice.zcy.gov.cn/new/noticeDetail',
            }
            url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
            response = requests.get(url=url, headers=self.headers,
                                    params=params, proxies=proxies)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = data_dict['title']
            status = data_dict['typeName']
            # Hash the fully-resolved URL (with query string) for a stable id.
            _id = self.hash_to_md5(response.url)
            # pubDate is epoch milliseconds.
            publish_date = time.strftime(
                "%Y-%m-%d", time.localtime(int(data_dict['pubDate']) / 1000))
            area_name = data_dict['districtName']
            source = 'http://www.zjzfcg.gov.cn/'
            try:
                content_html = response.json()['noticeContent']
            except Exception:  # malformed/denied detail response: skip record
                return
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '浙江政府采购网',
                'en_name': 'Zhejiang government Procurement',
            }
            print(result_dict)
            self.save_to_mongo(result_dict)

    def load_get(self, page, _retry=0):
        """Fetch one page of the notice list and process every article.

        *_retry* is internal: the original recursed unconditionally on any
        failure, risking unbounded recursion; retries are now capped.
        """
        try:
            params = {
                'pageSize': '15',
                'pageNo': page,
                'url': 'http://notice.zcy.gov.cn/new/noticeSearch',
                'noticeType': '0',
            }
            url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
            proxies = proxy_pool.proxies()
            response = requests.get(url=url, headers=self.headers,
                                    params=params, proxies=proxies,
                                    timeout=5).json()
        except Exception as e:
            print('load_post error{}'.format(e))
            if _retry < 3:  # BUG FIX: bounded retry instead of infinite recursion
                self.load_get(page, _retry + 1)
        else:
            print('第{}页'.format(page))
            for data_dict in response['articles']:
                self.load_get_html(data_dict)

    def init(self):
        """Drain the Redis work list with small gevent batches."""
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        task_li = [
            {'all_page': 3},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Jinan government procurement site (济南政府采购网).

    ``load_get`` pages a JSP list view and passes each ``<tr>`` fragment to
    ``load_get_html``, which follows the row's link, parses the detail page
    and stores the record in MongoDB.
    """

    def __init__(self):
        name = 'jinan_jncz_jinan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Proxy-Connection': 'keep-alive',
            'Proxy-Authorization': 'Basic MTYzOTY2MzE2ODphamxhNTJ0bQ==',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Referer': 'http://119.164.253.173:8080/jngp2016/site/list.jsp?curpage=3&colid=121',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        # Redis-backed work list + dedupe set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='jinan_list1',
                             dbset='jinan_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; kept as silent fallback
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, tr):
        """Follow the detail link inside list-row HTML *tr*, parse and store it."""
        if tr is None:  # BUG FIX: was `== None`
            return
        try:
            selector_li = etree.HTML(str(tr))
            tr_li = selector_li.xpath('//tr/td[2]/a/@href')[0]
            url = 'http://119.164.253.173:8080' + tr_li
            proxies = proxy_pool.proxies()
            response = requests.get(url=url, headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector_li.xpath('//tr/td[2]/a/text()')
            if title:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except AttributeError:  # title has no trailing notice-type suffix
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector_li.xpath('//tr/td/text()')
            if publish_date:
                publish_date = re.sub(
                    r'\[|\]', '-',
                    re.search(r'(\d{4}\-\d+\-\d{1,2})',
                              ''.join(publish_date)).group())
            else:
                publish_date = None
            area_name = '山东-济南'
            source = 'http://jncz.jinan.gov.cn/'
            try:
                # selector may be None for an empty/garbled response.
                table_ele = selector.xpath('//body/table')
            except Exception:
                return
            if not table_ele:
                return
            content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '济南市财政局',
                'en_name': 'Jinan Finance Bureau',
            }
            self.save_to_mongo(result_dict)

    def load_get(self, colid, page, _retry=0):
        """Fetch one list page for column *colid* and process each row.

        *_retry* is internal: the original recursed unconditionally on any
        failure, risking unbounded recursion; retries are now capped.
        """
        try:
            params = (
                ('curpage', page),
                ('colid', colid),
            )
            url = 'http://119.164.253.173:8080/jngp2016/site/list.jsp'
            proxies = proxy_pool.proxies()
            response = requests.get(url=url, headers=self.headers,
                                    params=params, proxies=proxies,
                                    timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if _retry < 3:  # BUG FIX: bounded retry instead of infinite recursion
                self.load_get(colid, page, _retry + 1)
        else:
            print('第{}页'.format(page))
            try:
                li_ele_li = selector.xpath('//table[@class="list"]/tr')
            except Exception:
                return
            for li_ele in li_ele_li:
                tr = etree.tostring(li_ele, pretty_print=True,
                                    encoding='utf-8',
                                    method='html').decode('utf-8')
                self.load_get_html(tr)

    def run(self):
        task_li = [
            {'colid': '37', 'all_page': 3},
            {'colid': '38', 'all_page': 3},
            {'colid': '81', 'all_page': 3},
            {'colid': '29', 'all_page': 3},
            {'colid': '101', 'all_page': 3},
            {'colid': '122', 'all_page': 3},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    colid = task['colid']
                    self.load_get(colid, page)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Haikou public-resource trading site (海口政府采购网).

    ``load_get`` posts the list API, builds an encoded detail URL per record
    and enqueues it in Redis; ``load_get_html`` (driven by ``init``) fetches
    and stores each detail page.
    """

    def __init__(self):
        name = 'haikou_ggzy_haikou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # Headers captured from a browser session; the '^'/'¶' noise in the
        # Referer is from the original capture and is sent verbatim.
        self.headers = {
            'Origin': 'http://ggzy.haikou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://ggzy.haikou.gov.cn/login.do?method=newsecond^¶m=431241696e6465783d3326747970653d5a435f4a59',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        # Redis-backed work list + dedupe set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='haikou_list1',
                             dbset='haikou_set1')

    def is_running(self):
        """Return False once the work list is drained and the dedupe set is non-empty."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the MongoDB ``_id``."""
        digest = hashlib.md5()
        digest.update(sign_str.encode('utf-8'))
        return digest.hexdigest()

    def now_time(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from *strs*, falling back to *pro*.

        Returns None when the ``transform`` lookup raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; kept as silent fallback
            pass
        else:
            area_li = [pro] if area_str == '' else area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, url):
        """Fetch and parse the detail page at *url*, then store the record."""
        if url is None:  # guard kept outside the network try-block
            return
        try:
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="part_1"]/div[1]/text()')
            if title:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except AttributeError:  # title has no trailing notice-type suffix
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//div[@class="part_1"]/div[2]//text()')
            if publish_date:
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # Explicit parser (lxml is what bs4 auto-selected here before);
            # avoids the "no parser specified" warning and ambiguity.
            soup = BeautifulSoup(response, 'lxml')
            content_html = soup.find(class_='content_wrap')
            area_name = self.get_area('海口', title)
            source = 'http://ggzy.haikou.gov.cn'
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '海口公共资源交易网',
                'en_name': 'Hiakou Public resource',
            }
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, data):
        """POST one page of the list API (*data* is the form payload) and
        enqueue an encoded detail URL for every unseen record."""
        try:
            params = (('method', 'getSecondTableInfo'), )
            url = 'http://ggzy.haikou.gov.cn/login.do'
            response = requests.post(url=url, headers=self.headers,
                                     params=params, data=data).json()
        except Exception:  # BUG FIX: was a bare except:
            print('load_post error')
        else:
            for dic in response['result']:
                key_str = 'flag=3&name=' + dic['FLAG'] + '&key=' + dic['KEYID']
                es = EncodeStr(key_str)
                encodestr = es.encodes()
                # BUG FIX: the original literal contained '¶m=' — the HTML
                # entity '&para;' mojibake of '&param=' — producing broken URLs.
                urls = ('http://ggzy.haikou.gov.cn/login.do?method=newDetail&param='
                        + encodestr)
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        """Drain the Redis work list with small gevent batches."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # Consumer thread drains the queue while the producer below fills it.
        threading.Thread(target=self.init).start()
        flag = 3
        task_li = [
            {'type': 'GC_JY', 'all_page': flag},
            {'type': 'GC_GS', 'all_page': flag},
            {'type': 'GC_JG', 'all_page': flag},
            {'type': 'ZC_JY', 'all_page': flag},
            {'type': 'ZC_JG', 'all_page': flag},
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                data = [
                    ('currentPage', str(page)),
                    ('pageSize', '20'),
                    ('flag', '3'),
                    ('type', task['type']),
                    ('notice_title', ''),
                ]
                try:
                    self.load_get(data)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)
        # Kick off another consumer in case items remain after producing.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    '''贵阳公共资源交易信息网 (Guiyang public-resource trading site scraper).'''

    def __init__(self):
        name = 'guiyang_gcjs_gyggzy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gcjs.gyggzy.cn/noticeconstruct/index.htm',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue
        # Redis-backed work queue (dblist) plus dedup set (dbset).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='guiyang_gcjs_gyggzy_cn_list1',
                             dbset='guiyang_gcjs_gyggzy_cn_set1')

    def is_running(self):
        """Return False once the queue is drained and at least one URL was seen."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the Mongo ``_id``."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record, then refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from a title.

        Falls back to *pro* when nothing is recognized; returns None if the
        external ``transform`` helper raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, div):
        """Parse one list-row HTML fragment, fetch its detail page and store it."""
        if div == None:
            return
        try:
            selector_div = etree.HTML(str(div))
            url = selector_div.xpath('//div/div/a/@href')[0]
            # Site serves GBK-family content; decode as gb18030.
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector_div.xpath('//div/div/a/@title')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    # FIX: the original pattern ["招标","中标",...]{1,2}公告$ was a
                    # character class (it also matched quote/comma characters);
                    # alternation over the whole words is what was intended.
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector_div.xpath('//div/div[2]/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # Region is fixed for this site rather than derived from the title.
            area_name = '贵州-贵阳'
            source = 'http://www.gcjs.gyggzy.cn/'
            table_ele = selector.xpath('//div[@class="text_c"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                # Detail page lacks the expected content container; skip it.
                return
            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '贵阳市公共资源交易监管网'
            retult_dict['en_name'] = 'Guiyang City Public resource'
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page for channel *types* and process each row inline."""
        try:
            url = 'http://www.gcjs.gyggzy.cn/{}/index_{}.htm'.format(
                types, page)
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print('第{}页'.format(page))
            div_ele_li = selector.xpath('//div[@class="c1-bline"]')
            for div_ele in div_ele_li:
                div = etree.tostring(div_ele,
                                     encoding="utf-8",
                                     pretty_print=True,
                                     method="html").decode('utf-8')
                self.load_get_html(div)

    def init(self):
        """Consumer loop: drain the Redis queue with small gevent batches."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Producer: walk every notice channel, two pages at a time."""
        task_li = [
            {'categoryId': '', 'types': 'noticeconstruct', 'all_page': 2},
            {'categoryId': '', 'types': 'noticeservice', 'all_page': 1},
            {'categoryId': '', 'types': 'noticedesign', 'all_page': 1},
            {'categoryId': '', 'types': 'noticereconnaissance', 'all_page': 1},
            {'categoryId': '', 'types': 'noticequipment', 'all_page': 1},
            {'categoryId': '', 'types': 'noticeContracting', 'all_page': 1},
            {'categoryId': '', 'types': 'succonstruct', 'all_page': 2},
            {'categoryId': '', 'types': 'succservice', 'all_page': 1},
            {'categoryId': '', 'types': 'succdesign', 'all_page': 1},
            {'categoryId': '', 'types': 'succreconnaissance', 'all_page': 1},
            {'categoryId': '', 'types': 'succequipment', 'all_page': 1},
            {'categoryId': '', 'types': 'contracting', 'all_page': 1},
            {'categoryId': '', 'types': 'buildNewsConstruts', 'all_page': 2},
            {'categoryId': '', 'types': 'buindNewsService', 'all_page': 1},
            {'categoryId': '', 'types': 'buildNewsDesigner', 'all_page': 1},
            {'categoryId': '', 'types': 'buildNewsrecon', 'all_page': 1},
            {'categoryId': '', 'types': 'buildNewsEuqip', 'all_page': 1},
            {'categoryId': '', 'types': 'buildContracting', 'all_page': 1},
            {'categoryId': '', 'types': 'directPub', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    '''济南公共资源交易信息网 (Jinan public-resource trading site scraper).'''

    def __init__(self):
        name = 'jinan_jngp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        # NOTE(review): the Origin/Referer values below contain stray spaces
        # (likely a paste artifact). Kept byte-identical since the server
        # apparently tolerates them — confirm before "fixing".
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'jnggzy.jinan.gov.cn',
            'Origin': 'http: // jnggzy.jinan.gov.cn',
            'Referer': 'http: // jnggzy.jinan.gov.cn / jnggzyztb / front / noticelist.do?type = 1 & xuanxiang = 1 & area =',
        }
        # Redis-backed work queue (dblist) plus dedup set (dbset).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='jinan_jngp_gov_cn_list1',
                             dbset='jinan_jngp_gov_cn_set1')

    def is_running(self):
        """Return False once the queue is drained and at least one URL was seen."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the Mongo ``_id``."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record, then refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from a title.

        Falls back to *pro* when nothing is recognized; returns None if the
        external ``transform`` helper raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        """Fetch one notice detail page and store the parsed record."""
        if url == None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="list"]/h1//text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    # FIX: the original pattern ["招标","中标",...]{1,2}公告$ was a
                    # character class (it also matched quote/comma characters);
                    # alternation over the whole words is what was intended.
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath(
                '//div[@class="list"]/div/span//text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # Region is fixed for this site rather than derived from the title.
            area_name = '山东-济南'
            source = 'http://jnggzy.jinan.gov.cn/'
            table_ele = selector.xpath('//div/div[@class="list"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                # Detail page lacks the expected content container; skip it.
                return
            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '济南公共资源交易中心'
            retult_dict['en_name'] = 'Jinan Public resource'
            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """POST one search page and enqueue every detail URL found in it."""
        try:
            params = {
                'area': '',
                'type': types,
                'xuanxiang': categoryId,
                'subheading': '',
                'pagenum': page,
            }
            url = 'http://jnggzy.jinan.gov.cn/jnggzyztb/front/search.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     data=params).json()
            # The listing HTML is embedded as a string inside the JSON payload.
            response_str = response['params']['str']
            selector = etree.HTML(response_str)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            print(response)
            print('第{}页'.format(page))
            id_li = selector.xpath('//ul/li/a/@onclick')
            if len(id_li) > 0:
                # Rows carry onclick="showNotice(<iid>)"; extract the numeric id.
                iid_li = [re.sub(r'.*?\(|\).*', '', i) for i in id_li]
                for iid in iid_li:
                    url = 'http://jnggzy.jinan.gov.cn/jnggzyztb/front/showNotice.do?iid={}&xuanxiang={}'.format(
                        iid, categoryId)
                    if not self.rq.in_rset(url):
                        self.rq.add_to_rset(url)
                        self.rq.pull_to_rlist(url)
            else:
                url_li = selector.xpath('//ul/li/a/@href')
                for url in url_li:
                    urls = 'http://jnggzy.jinan.gov.cn' + url
                    if not self.rq.in_rset(urls):
                        self.rq.add_to_rset(urls)
                        self.rq.pull_to_rlist(urls)

    def init(self):
        """Consumer loop: drain the Redis queue with small gevent batches."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Producer: walk the four notice categories, two pages at a time."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '招标公告', 'types': '1', 'all_page': 4},
            {'categoryId': '中标公示', 'types': '1', 'all_page': 4},
            {'categoryId': '变更公告', 'types': '1', 'all_page': 4},
            {'categoryId': '废标公告', 'types': '1', 'all_page': 4},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Restart a consumer in case items remain after the producer finished.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    '''安徽政府采购网 (Anhui government procurement site scraper).'''

    def __init__(self):
        name = 'anhui_ahzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html, */*; q=0.01',
            'Referer': 'http://www.ahzfcg.gov.cn/',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue
        # Redis-backed work queue (dblist) plus dedup set (dbset).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='anhui_list1',
                             dbset='anhui_set1')

    def is_running(self):
        """Return False once the queue is drained and at least one URL was seen."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Hex MD5 digest of *sign_str*; used as the Mongo ``_id``."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record, then refresh the running state."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort 'province-city' extraction from a title.

        Falls back to *pro* when nothing is recognized; returns None if the
        external ``transform`` helper raises.
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url, retries=3):
        """Fetch one notice detail page and store the parsed record.

        FIX: the original retried by recursing unconditionally on any
        exception, risking unbounded recursion on a persistent failure;
        retries are now bounded (default 3, backward-compatible keyword).
        """
        if url == None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            if retries > 0:
                self.load_get_html(url, retries - 1)
        else:
            title = selector.xpath('//div[@class="frameNews"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    # Any two CJK characters followed by 公告 (e.g. 招标公告).
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$',
                                       title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath(
                '//div[@class="source"]/span[1]/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            area_name = self.get_area('安徽', title)
            source = 'http://www.ahzfcg.gov.cn/'
            # FIX: guard the container lookup — the original indexed [0]
            # directly and raised IndexError on pages without the div.
            table_ele = selector.xpath('//div[@class="frameNews"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return
            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '安徽省政府采购网'
            retult_dict['en_name'] = 'Anhui Province Government Procurement'
            self.save_to_mongo(retult_dict)

    def load_get(self, page, retries=3):
        """Fetch one listing page and process every detail link inline.

        FIX: bounded retries replace the original unbounded recursive retry.
        """
        try:
            params = (
                ('pageNum', page),
                ('numPerPage', '20'),
                ('title', ''),
                ('buyer_name', ''),
                ('agent_name', ''),
                ('proj_code', ''),
                ('bid_type', ''),
                ('type', ''),
                ('dist_code', '340000'),
                ('pubDateStart', ''),
                ('pubDateEnd', ''),
                ('pProviceCode', '340000'),
                ('areacode_city', ''),
                ('areacode_dist', ''),
                ('channelCode', 'sjcg_cggg'),
            )
            url = 'http://www.ahzfcg.gov.cn/cmsNewsController/getCgggNewsList.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retries > 0:
                self.load_get(page, retries - 1)
        else:
            print('第{}页'.format(page))
            url_li = selector.xpath(
                '//div[@class="zc_content1"]/div[3]/table/tr/td[1]/a/@href')
            for url in url_li:
                urls = 'http://www.ahzfcg.gov.cn/' + url
                self.load_get_html(urls)

    def init(self):
        """Consumer loop: drain the Redis queue with small gevent batches."""
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Producer: walk the listing pages, two at a time."""
        task_li = [
            # {'all_page': 21580},
            {'all_page': 3},
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object): '''南昌公共资源交易信息网''' def __init__(self): name = 'nanchang_ncztb_nc_gov_cn' self.coll = StorageSetting(name) self.collection = self.coll.find_collection self.cookies = { 'ASP.NET_SessionId': 'kxgkxo45v04bzs55ie3tib55', '__CSRFCOOKIE': 'ad60f543-41c8-481d-b0cf-accadc73c516', } self.headers = { 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', 'Origin': 'http://ncztb.nc.gov.cn', 'Upgrade-Insecure-Requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Referer': 'http://ncztb.nc.gov.cn/nczbw/jyxx/002001/002001002/MoreInfo.aspx?CategoryNum=002001002', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh,zh-CN;q=0.9', } # self.session = requests.session() # pq = ProxyQueue() # self.pq_run = pq.run() # self.proxy_queue = pq.proxy_queue self.rq = Rdis_Queue(host='localhost', dblist='nanchang_ncztb_nc_gov_cn_list1', dbset='nanchang_ncztb_nc_gov_cn_set1') def is_running(self): is_runing = True if self.rq.r_len() == 0 and len (self.rq.rset_info()) > 0: return False else: return is_runing def hash_to_md5(self, sign_str): m = hashlib.md5() sign_str = sign_str.encode('utf-8') m.update(sign_str) sign = m.hexdigest() return sign def now_time(self): time_stamp = datetime.datetime.now() return time_stamp.strftime('%Y-%m-%d %H:%M:%S') def save_to_mongo(self,result_dic): self.coll.saves(result_dic) self.is_running() def get_area(self,pro, strs): location_str = [strs] try: df = transform(location_str, umap={}) area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df)))) except: pass else: if area_str == '': area_li = [pro] else: area_li = (area_str.split('-')) if len(area_li) >=2 and area_li[1] !='': return '-'.join(area_li[:2]) else: return area_li[0] def load_get_html(self, url): if url == 
None: return try: # selector_div = etree.HTML(str(div)) response = requests.get(url=url, headers=self.headers).content.decode('utf-8') selector = etree.HTML(response) except Exception as e: print('laod_get_html error:{}'.format(e)) # print(url) # self.load_get_html(url) else: # print(url) title = selector.xpath('//td[@id="tdTitle"]/font//text()') if title != []: title = re.sub(r'\r|\n|\s','',''.join(title)) try: status = re.search(r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$', title).group() except: status = '公告' else: title = None status = '公告' _id = self.hash_to_md5(url) publish_date = selector.xpath('//td[@id="tdTitle"]/font[2]//text()') if publish_date != []: # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group() publish_date = re.sub(r'\/','-',re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',''.join(publish_date)).group()) # if '-' not in publish_date: # publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8]) else: publish_date = None # print(publish_date, title) # area_name = self.get_area('福建', title) area_name = '江西-南昌' # print(area_name) source = 'http://ncztb.nc.gov.cn' table_ele = selector.xpath('//table[@id="tblInfo"]') if table_ele != []: table_ele = table_ele[0] else: return content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8') retult_dict = dict() retult_dict['_id'] = _id retult_dict['title'] = title retult_dict['status'] = status retult_dict['area_name'] = area_name retult_dict['source'] = source retult_dict['publish_date'] = publish_date retult_dict['detail_url'] = url retult_dict['content_html'] = str(content_html) retult_dict['create_time'] = self.now_time() retult_dict['zh_name'] = '江西省南昌公共资源交易网' retult_dict['en_name'] = 'Nanchang Public resource' # print(retult_dict) print('列表长度为={}'.format(self.rq.r_len())) self.save_to_mongo(retult_dict) def load_get(self,categoryId, types, page): try: params = ( ('CategoryNum', types), ) data = { 
'__CSRFTOKEN': '/wEFJGFkNjBmNTQzLTQxYzgtNDgxZC1iMGNmLWFjY2FkYzczYzUxNg==', '__VIEWSTATE': 'QnAQyc3/r6VenaNsD3N9NizbNQGw+P+E6kFiFSJ13jzqjGk0g2kwB5V0Kntc9wsMi2BWrroLA5x2G7wZzFXp9cYRvyfq+9DWkWifNlNexQajTfAAwwxwTYEs1j2HqAFRKPzmOEuCPkPncRO+t3YsexHUWv6gM1MuoFbn51QT+ewUuY/8FZBMImfUxklIPGmWLmfiMQJiQrlnC7z3sF5RmmWuoP5MQDORXRoqwhxWkWJjiI9YJaACijKj/p2k4l/g+m1C5h5MENVN5NBUH6XT0JKcX6U3x6heix6GMubJWfhEXa+naDfzt8sphrsVQqfLxndHZ+5y/L142pUh8NbguhSvJPTSIamouvOuXA40FfyWO480dj5KxhiBDh/gvHVJoEfBGtQsxkguGail3QJ2MQnH3rtczO2koJZMeIQ4v3Od+5aHkTYSbdm8XALJfrZCtiHQV9IwMoPTW8KMBOb2nib0tGvUYfP5h+RT/wVF0z074bLI3JlOdTxLthYBxyHXIFZCbiOawjKyKe9Vn6cMrcuVVICO/YOxA+nNHvd8yJQ/qQ8aUk1KbOAG1wAWLk1HYxVwg3n+8fsL8YvMvsJdYTVcAPnLyaIlKPWMrOEM9KcQa5/2PgXnbYBgPOO30mSN8qLO2tbwvZbUOlVKWeDw5eaQMkmDQpRO+ovhhIgHDdEcYUp47n5aFatq0uvJEd7/uICRGoE8lz2pc0ln1OhqtJlyKzpsA+zdBLmiQMGLJBzmGr6aGxd+zinnMApyqM7d8Zt9Ie1QRg3GX0a2ikdXyvsAi1UUaqDFaB13fB5mdKyrDexEUlHXD0HvhlRW0YD0m3hWgauV1OiEnj3w3Ju0j2M1tlTAqGPDUVFtkoFRAs61gXyihiNt2tnJO6S8XcFeZP6rPxH3YWeEdLMW2TfJDhOLNKeiSB37/sV5vUX+bNEr0z2j84SZONGpBcXrLq7WF9KKT8eAZuD4ATW1gN+BaCN2hkuBv9tpiEZoJTCqaYFxiWbRseiEe1GHAX7fohZbDkiwsASGCPtiahtWXxOOBPqhttRnpv+aguP8OdNNmdmQfL0MQ+XDPN7W4sxAqxdHoGf0lzUu8KYfetXfK9oWccJ6nWG38U/gKAnibKAIou8y0qR6mduD1v7UVXC3LSjNHSTPkxvbsLNJByq5zAeBa4HWHv046/fnz56p3ViWxV/rFiCdmHFfL72kH/V2zSNOCr1ec6J23vdZavplR4r68OF6zFV9Y8uFEPmtC4klHe64n+aqOtJs1vmA0XRCzGJ08AwfdNjSbGGpv1i0HUqEyQG2NCvEtfIHNCQcvCxU9rb2n920DGBN9aVF2LlF8VIVzDQq7Qizh0jtOQ934qHVYKjooasIs5etbXZqpxdNs0NQeZt5GbA9ysXkVqNUv1ca4lgVPy/smJc3YM64zPCHpcx6TucHMik26ksSTmaY27uJDgn9ihU1JeUMW+MewDVHekqsYprbIDnzYzpHTSZmA3JjBY5y+rlpES0FIROzzj9Ng8zCiiLyccFCCAEZf8RrcHiGUU7j7c/ljgh6b9PrFDQDAWcz3Vip5nGI6+Na4N7QmpawwncKFKUFfvYnmyzaGD3R9eCx5yKo6oIbo3M/4iIZxdDzF09CHZQd3WnUhNYcgP31ScXNzev2vY4LiBZidS8dbNhcm3ZMs4DeweLcDWthEWn2IVhqMZALreAbmx6cu3IAPrDBNm1e6kbguNW1UZI+6ZLK9YTKe+nnTN6b+4xk4gBbjXY8MoKnvcCPpw+M/jt7Ohqx8JIFaw4mEq3M5FY/FpRz1hQt9KkNfBvCKBIMZYJAq7kxJZlg03klgju2s1EniNP5UU6lTR7qMBeRCbdbvR5+G6nFzgM+G+YKNPiCFty0m5ab41lp9VkxXJ6qvyqCKfdZ7
a1Hz8PReKqih6NZbP3dI6Ls3sV2JX30gq6u2XsjsB/wIbpv9/Lz6RIZedQUagmirtMD1ZfKWoCQOAyI/KBsSbLcOxHkgaMpIC3L68MNfY3JMJy2lSZfkwME0m4tKG0midI66EkcyZFywWoMnU3GxZtX9oYQXcA8sTW3O8c2xzEgE1N4s+fBluMMr/cSK+IyWswXFxu2yHete80frdq7tA/D10U9IBsG00pOqkh/z19FkjyrtvEmh9RbMitaSdiUMMMRzMDSUxMVlSAE61SxJpcaCrGi37Em10w1G05Pd8SKnVFKfb28B/enOUT/wAcQGyHy3IKlf4j+T4L298kYq9dGJjIxC3KmNZflc3y7gVeR1kOvOvnL5pyycRoFEGYqbsx3GFp7F4kLCBXHkFfrOzWFyrmE14yCeyCgGSlGmEazGtYQfoFeEr8lAD8HLOcss/Skwsrv8sglS4RaaSAGaxkPFHAOYpfAtNcus05Kj0z9vzPQmoAG0K1N8UJ4FwXMj7zvf/+yq/KI9GyyXZOEll52Zjsf2ojFP10nZJ1F2p4or9BYNLhAah+SDOjQHdSi3DBNRcies9qJEU9w39mH+VwF344u+tibtehwNv6Hr+VIFW+GASRsXxnBx2+SxTv2g6mxx5Keg2l7DEglcnh72/XmTDy9KrSrFiCSiwLoukwlD1E8vEguf4kAqluumMrRWRjWZyaFAYANuUN49/YZ6h6n3c8sr5zS9ceiK7TTRrgq6Hnd4K/ivoeHImyHXuPg0/EJI74F6J1WQxHLwAkKo1KE5O2c1xMLJV0deIAwIi3jEXuYHaUF6PzNoTFIWpXGpEVdckUx+Y6YC1ty1H3Q97eWfw7am/G1GP4WiotMFT7Iri72ORNrv4U3/LEljoST8CYwlesHDIERDgavp3UHl4OXpoNX7rPwN8N/Vvm0bPX5nLmlBGDhOhyqs2e4g1cAQS7gWYh5KTbEmSi7yQM5fIHPnXD94amrIFxxX0qwTXowXZlz+pJwRqJ0l8fMLFucYgZznjsV0aZFbpBRXTNfhjwY7fI8miJv9cj/ZS3W5RgVEjZHOkxwjf0MjYlL7Pgo+zln8tk4BvmceBh3obTb8oX1D0fbQf735RsLREEBkZwPHY8nw3G4KBNBqY625GsAGkQdJHwZK7mxt1PedLc7tNpELjvIug+RkNJCOkVX9ndhvw/+CAKgmDGNwa2/O9+kR+D2FZQ/Bz74RVkfZVwauky6J9whzK+pmp7UW2qTzzz1P3XWkq4TDWR37loewkm3M6UKhgeXH5oI5f+YB5kbpHz9FwVxdjAo4s3gUprbDdaTiid422W0eWvSMmJ3Qmjztsqxpey4mjC0QSfxum1eODI6zljomjhtY0i12XJHeaDkXuBAm4NhWwc0R6EnMJYS1YgkEJTpjUEiRfnIZq77FNMr2M+l+MlbW5cv43zB4kNI6/DRJuw4eNw38KulkRHo9lyMoqV0404ZLC7MxiOfCeIBOa75UqzpHo35bHG5mPfe+JeF2CPMMFZotXQfOvQUS1CSSC29XC4UrLGEn6wu9HxSb8pXlCMx4XuPgLO1mRHmS0HN1Zba/LJvZEUe2wZkUkw7ubRkt33jX9FBtKU00z4d/l1ZLeVudwidqzU2zF5JOmee5qrvCyog7YSGJj+G/iUrZK35KFZzvfKwmLYBcEvJS5cMb8v5v7U62JIvvTFp2CK6NzMO1ojBa4140XWunFKriOSxaauZeCuZ7lcw0GYbu6oOSOQCgsUEaHN5HmhcRkV88iI2TbXi4PpTN4UKqrMnbU0oA6hjPrTfOBM2EbU7HEYpi+P7Sa36i9uzSila5BYC0n3Fd//ai88+QJq8e6mB8N4p2V+orF6pnqnYH/+YLsW8DrIXjPaxgRGfdjSdcGH+BRlOc6BdvDEBp0xCLlIgeMkaGW2jR9uyBAFkJcvhKPtbXYmFc1CMtHcO0yrLo2CnCp8bNRpp5UedCGz9EHycl1YIuT0KVwYvy
o/ngPHJPSzVk90j2GYFZiW3+LklW1RTvdOUFAOq03DlC/PFGw1piuxyeipPFy0onVxMLDovHIiKXtns5uN0Nrzv1UMpRpE26itdV4tRL2IvAjxcnn9eG7+tGlc1pO2NvbhJyfGTHfT1NxC+NZCzR6iOXRwII4apHZPyD+j7Hold/WFLS1hF9FRQ6vjAuxacT/Sb9p8kCgOW+O/rhaFx3LXj2WN/m4ur8wMSUlF3HpYPlIt2I4U5MNWtciFgnCegzNjLI98ja3pGjRcDo6xohcGkfnE648wF0mW5pmQLyFjdg7pAmDcQD3anZANklYVdmbpvvXv2ZbEWSkblFq7ExOhhLgilAn0bJbn810e0XPcy79zx+XIgc/i9qul3ATMeHzf54tXisCRtyml+fQvBW/O8+ofLw9470P2fALTAjyJSYFvPImWMOgDb49Td37OdceJjbfflldDly+XytCnhBNmYtrq1TZQrU3vxNiYXTwb4dJ7kDTET6h+Xwp+MmeoM1guIKTm+V5SAZVGij3io0Yrr8BDTZrt9OTitJg12CPCIx2JXdT8HqFigYHFpMUDhh2Xi/KKelqDHYbK+IjInQQCaJ8dmBi/jWwhgTdc65/y/Q3M3vQS6h82x6Y4gHlcxJy76jE/j3jAOJ9EIE1rqaQp5Dc+/Pc5g1pNALVSowQXN4f2IH1Ipu/evu3H7SjBBu202GuqVeY3xd1f5K8JKlgLk3zwMn354FUYfRB3Z8Vwp75JSInoy5W9e3yHnGfNnVUMYxb0T+hhBclbKPi1UtkoEU53QjMUiIeMLvYc1v9cElgiBmfUVDKkVTdwmiSU6miryV1mAVvMA7RshKPi74AT3kCnZZghMWgx2EJ8bZJaDGFNqukrx0sFz7+zrZEuTjwOWLi9HpLOX2dyqha7o5sndKHa7/nm4avXKESazFcHHQKUOxLjdVX8/VUZygXtDctLe50TnDUBySZ7P2DHkd7OW3TnV6icVLEVCmwMF7eMcyDC6R+TdzOgEFGOF8qv3EH1k0VEORnprnYeGUFoXkdc52MeL1NZDfkQ2VzHH1KSC18BmCmgaXkVU21nktA4Lr9nSXglHhY34ribfm/CY4LF8cc7clx5G17xRKLi0sjwfDu2Ra8VMVZCHjEJWljqHw5zMFkJDhNIRF40YJLr5yEg+88bglvScyvNZN3VCUtnIJk4Y5C7DM2GdDA68q+7BTfGoLOcwpGrAuWFesHhx6oXA2paZykfXNzZzUtyuOoHt3cws9L9jACYr9nQZGBWLVF1yUm4fWrfBljMPvRr6iaBzgRUHtbhUt2rICUJXcrrNKsqGhvLLqds+I+p5gSkaG678swB9Zfgr2ipluO4yBv/weSMzbiCswv6CRPaQdHek6mxxiEGCdqgzLhoiijQM9DVck9VC1Ox/eLwVoq//YAZcoFBZ8dgN2z8V6faesysLmomW6ZxGTc2t+1PPmVl23uzRZaoB+RPA2PD0DArK6BX+iYhSOTdXjslo8pebg44/nOGt69pkABVEBGfGadQcOHzpScZmh3A1LM9EeeRihCPC9gE7cn0qTncKOWz0Qs0lAN6TEx0+zrt6tgfT7DDNLj8GrGErfHNE3pKYDF4q9O2S9hVj8+5HF9zf55t86xqYRpbDVS+2lGJma73T3MrBBCSjyPVh1rsBA3v5W7Ca6dPC56WUaluUXyhMAPrmIoPMo2/AcAhHb54lbceEGG971sRpQYhV5kHYtJ0kUPtyegW64mNkhb0bi6MB2xNVe9mj6gBNxqX+h0e0p2CLKgQpsd+9PmFslJ7wrn/v7xfkGXM3YpKddzU2VL8as5bdMg5kFhv0kXUwtyrKOQM+WaFrccHQjU65tWeL6cGBLrfN1NYYd8bZR90vVXjR169ztMrEOy8ei/kzb/mlQOJoUgAPKFQXnCmdJdvnS+jyUxg4fO4IU1tqMXiKwmpLx6PIUrTrSOscUt4JSlWmM5+GQsGo1BB24Br909udaytL2
f5JuBaHwUhjxhwv4bjsmMb41quhcWgr7XO8ycvAhCQPCexYC2C6xNJO3U2JtLy66YiT+EvTEUfCbbukOqRjRKv3i5XdR6p1fdA0ubU/FIUvQD8RSQYai/iQaj3nve6C4jLdaGA5p5QE/Rkx6TSyJ64jGqe3kq5K4RcvFk8TZ8dDCxuXzbZG9zP+UKv1y85kV+4XBDjcaeOe2zYu95uyGcSb7a/wfnNtsDg5uYwGsFWRWgWJKS9kXXtUAots+bkNzp3JRm4+ljntec9HisKPFIdxDCWlqOfpaPwgCoPLwvs2F0ZI0TXG4dihmnI6Bwm00NWbe9GwOCje179Msl49rED41zLJwyk6J3E+W++yBAajlnnOpx63oNQ7Pl7a5kEdSXtD6tx0SKbyGy+0iQGt1KKYOYYfB2HShdWpX7Lf0PkiPnrAFM9Mso5e0lXkUftM5q5J/WmIgBLECmTQAkhr93jftlM1HPcJycgEJYdE30fMPXcGQYcOIlecuPxDul7IiVA1/l6qI3tMxYZMXaqvpyPzgdrElxGTcG62faQ4fgw+ZJjXXfogz/NHeJUa8F4avUlWMJ9Rm2mDcMFryXrTS6Ul2a+39W03FpHCYLAGjb6RxxDMrO38fpfVUb6I73WmH5MhcAo81KR0lxWo/Ue5z9bLVBr1/yz7ZpowRONNlcQQQ0pSkGNbbHb0d1f/dUIDKxT7JwHoOUqzps2HS068laQGB5fphFdGqyfs9ZB+w4n8Y2KtYFjmfRG1SkYeqP3EcWfZ/QS73dhq33H3ijqkz7oLSomEmMnav71QjSW7JO/Goes14+uvOz6Kp4EhCEPxfJVcEgeKQjZkt/0kUUs0h9HPciZ0bma/gGSBxnwK/ktSCSkRCLFoPwSuY8VoxFs79V9D/Q9PfTgpJZBHqy21JDS8CVIfuHRUPz9aXoQxCbn0BKUP305AQWxu9J/TNPIXERQ2mjt/1OfgbcgTE9HAphhAGzEOLntaCZRKKqJ6kXN5kJpcg+NBUlYDdta3GdhUhswokuvvJAX/TYE0VQgzY5bPAQwvzDrE83v6cA8qmxMCJwHx7jQFHmHerK/zZJEbFATZA7TG5beeDSnR2l+XL4OkjQ0CGudJXMxGtyAqmonLvlwTcMkHfpQ5AyVLzAAOOr5O284q9aZLFdZm8epvjJI0hk4M48AC51DwQM7WJzK2aas3zVypwigBdpHY+4jy1T3E0BKvQjZiO8smn2CyU8MpEOUQGfREvYY/ug1Az1olxpqBVB67NjsLrTmMdQ948iTHezJXxOIDFwUBb2TQz6w5jk6hkS0/KkLJgiCmGav2ShyHfILiNjOtSh6ggqqAFHksEGDD8MO+fdYHxaIDi3BWu/gW2rtqaBBTiVQqA/N3KhyUeALiOl+RDhVAideylHw+kh3n/O2k2o1hUhC3stHEHxcPLXmXoVna04LtYFfZZSTUliO4iNji8ZYgfmO1JP90E0L7HD4my4ci3H3XyiYExNMaFEGL3rRISHIzKSRUFAgYbeVBoc7JEGZr8GVGRFEpogve/JM5Z7TPLdLS4u88hP44xytxLHw2OzYf2eqCv50Ux2wb5mgCjRG0xrubj62FRbCRxLJLfF5H88chKxbFu3/Da9vzVQ77k9LCa3fHb6tkfv1RXTMemM/5aMaMb58TQlwNyAImmzex5rjTl2dMb1S8FerYTcIlIx9PVdaqxYmN5A3T0u7u53DJH32l8P5wfRH7x2hArECgzV+/HP4YUZBPhqOeWZRoF+tZjyrbQv+srU/GRx2z1IUa6/BXBcidpznDD7a1uOcjXyD7jmTJIguEXW7sGm2MqfWPhpF0nMvH6SBkR0P3nrpDMte/0hIMdH8k1EaXm9tVq0KMoGVutL+vf0dhg8vqjVpziHSxz6+JRWkHTvc7bigzYf1T78SClBUqrBewxWITkfmMTBzlI2v+xZCCopTPnqbMjze76zB7L9INdT52NsYFupfYZ/mnVcVtDmjAsysZS
WmZ7IjWSC1rL5IhcCXx5f1u5TS4z8asM4gkKI0khJVh+mGw01mUoXIDqlzNF+RjIK5M0/AZiS6S4K/BESTHUkVwkpmVdrmKUEcVeae3S9RFFy8axPLIFyiueKQxo4pRJmJa8hxgohQNEwGqSTYuCHDc0aCwZS2i2UQ+dTM/vex1bcvknYan6WLqh865Ijo15Hv1mT+YfjDi3kGjwUV2KdrUTQYQ/EQDRRLfofoxMML6vpAkVswa8cdbn0VQh2VI9F1NE7epGZdlw8+zQoNJXwEGIiaigSA8JmVpfN/a63PxbVhET7dyELiZxoO7GvUqI9BCM6lGitPkGl+/Di7w2DMFHR3+skcJRsFHn1HmFDrno6KhyE5yir5berCfOnA6aWGHHkL0qA4u25DjaGpv8SvYIb9WZaM3dBAx106tMDwyOeMcnb6INa1eRabadu0/Qo/8AHtjA5dky6gvLWnLMVADM2FTGHgt3RrSA3uy0s8FBYG1nxbA+rws0Z7egCAgRfuf3rE/BTR2DKRkF3RqnssJMnJM8XAahi8xP7zxQQijSrl0OA9ixJEBbJAxkGo1zeWz44YOFdmqkbOxClqIQcwqKLdxdSNoQt6DP3JkAcfZ+QwPGPp1bGi0H7c9pbWNLdZ/2LksjrgiTlh/L621jq6OfabquWKHFzpa8pQxv6GtQNPxYk/4yGj1io6i+JzfXZVqGWDv6gOKIW6bghC+Cm20088jo3GfaGjbx3LQiQj6dwC4fPJcLnZhrSFMzRrUa8DiUj9ZwJpTBcIPvoGpywDUgDQnRuRLOGqb0sZvppRyTK5c5XkDvr/efIsSj0G6W/FDxLbJaqihp/yTd1Rm4I2St7mH197FAqSQ9B11ZdZQyAGiSz18aTGcMXJNi/koXf2RtnVC9gOVHK6ZyuFxBW+GcAG/XHqbm87Ve+F66UKKaSk3YpYwoXnQStPCbNJXqMdzzaWQCTWw/rJk4r6xSAu3O0RP4lup+sP2xI15eKQ+Hp1NfBmhmk9M+NdbDfTQDb34vln7Matih5WndGq3FeiAu2/c88Tu7uxTBy5Bi7kmd0yslnudEfw79QUh4JccdvuIKMqRqE9gu/Wuv0jO3s9OJ41b/NamWah/PlL35Da2toUU7V2GN0PonXwASLoOTQBTfbGHhMRT9FSgx39qtZ1hEFtdTlRTNEI/2Ac5FLKwRUgZL30Z+RpZf1z4AdbpwxOG+NifHWxN1DvGunpIhPUFfQp7fLKvz892Mxw64U8tu7RgPjqVx/vjyy8zZ0Hdpvq2uP+t13Cty5u3jCCjdT0uwSpb/wh5XBi+bTatrjtNgCr8FtQ9vEPoh4tEpvZiBZ0XhQnC1TwbJJk11Tw+Z1y5aOKAt6wzK3KNZII9E2pJRL85lZb/6RKzM8rFVwWsAXR4HFVZsnIAa7UQ/WfpXb0Rs4gkA29vwPdu6P0/hOKtKVd8iEJDBgCeDhsnlNZmMbg4W48kVMPDTkdP+7CdxJWteFIpDiP6a0bnWNKAF0wczFw+3Yj3QnJannss9g7u5WhavZiOGCkC1IH0dOD/yDQcprtwOKhcCJDEGeSjHAQX7dmkG+dob6DgW28wHD1EpcpZxVSne3lAsk4GrXh9ZbUrNjp+5W7/6van8cFGuzApX9b+kJduAv2u4HcuBaINfD/xuJv4QmiZlfihefrFAzP07kQPTq+bPxiRLV00hhY0UK3YMnCRv1py/m8j6Bp3tD/V9i6tFkPqmEQYiq+76Q/anCe4NtiJN6w++TH8L5V40NHoKNTweG54dwxGIe7SGeZ09YNkQhKv42D7CxCkcpHSIZSqin+HTu/EMM3Zsh6Yl56EiCdC8jxg624cG1whjwBrAruDZdeU6G0r1HQ1Az7iCuVPW2HbVr+ybTWVK5JQtRuKQb11ZvtznKqaGypAqpE4IRCbCQ6GEMSp56z77VmTYqWFT/IaD7f57xlG4s1t7NjawxnD4NoYQmaWLIGxhl2SSCqRP7qznem98
zB1aV9QtM+fA7R18QtUel8+8v/I0ViPBm1B3flYuDsEmKLc9V1TBmSyZ2/VOmeik026gOdNpkR+fAGsLnCcHY744PJkG3uiidDBTwJPVYxMqh2dF0m0oyKI0JNuXpwymhnEsFdWJBph2eFo3oiiI/RCph2Q+e5e2+QMd0yyomByWCT5T41Ay1Osisr3DFlh436SYD10c000VE7Rx6WK+3Fldddxee33R5oOw6Hu4gdAKSjo/RyzO1MQe/5fdqYgZmfGhcBVjk89oAcLFWKGWpk+AK+k7qxbDwRScuFkelGN6vO8TC8YsNutIZpOoyBAkvgkkCEL2Vc97xmwDfin5J+hJjYOzBBZwEK8uvwUNHkTM4/uWnXpeuHJWAR3Tthja30v0752hx2udM42S8KqWtzzKDoRyZlXEqPoOpHkOGWRadNdHM2Y3Poj8dEUxLPtucC9Ce6Kn5kXA8owvUkwxaGU6W4y3xSe+C9A1LZtQtbOt/WkAC6WJaM9dxvqZGBhMuM1J1pKxK7hzAICfrO/mPzmDPGDDDSNf2jiF2uiixl896rJEnc6Ht0aRN3QZcfTtxJM4F/gGhZIipFQRio9YIGKK0cGW+7SlTkNzZ+1kEnBLbNQ0H3i4xTkHtIVlg2MklOuZ1hbNY/qj91rueQsNrLfgxi8OB6pQ5BaJh0dwRwcry+QMUghXk/pP8Rcee68l0wJsII6qqdjQhrra14H7tzGrREi0kipJLtQsaIx9Jtlk14Hk/7eStM+rGKxAz4+n8YyEOQh3CHPUgNDOGXjeHdl+JuVkAihXhC/Ate1bTswLtCvXtOunQe+hacjFA3nX1qvX+Shd7wCLeKuGqpKrGjDhMFgq5ZlsumPgmdpBuNelVHDC4k+hX5+lPIs24vgt3enZxpRUxUiwfw3RPTtryQlQjCxHO9MA1eLvo5m1I6AmX8Mp7Mn/RPE0r64CPWsx1P+Xkx20RUjWefnPFPS/K0AB1lkaIi+6csaWLA25ZejPLL4kojqlRbN6KVYxML7umLJZxoWj2TzW2WonAaa2m/OFd5kheRS9xLcoAqi9h/4zh+7pFLZaNhz8t/iLgYTAKkVGt2x2MKe9Wz6oLh5zLxkuCDUX9fHMOuB7uEUgba9aqKP9Z3FmOVkjiMgv1qtjbjjJMu9E5+chdpva3oDWLV/lfKDq6tqppQXVYwVXQ6POBv5YKGoQSCHc/w35LuQOkBucHSl3v1bjm0KSW5Eg4gIe9kfs9eaVQ+kROPfOyCVS6DSrYVB7t20QgztcooNkMEk7q0oP37XBsfgdUFDzqhckckWx0numJfCZmQANsaXS1o5dhpL++OjTI3F8+DijVBesuZKW+Uslu6yysllv5ddjT+vfSmu3sXNaUUFbFkJXyNmdcMC1kgUCHU9IIxlrACk2TNcM2mxQOoQ5/jR+rgyu6epBA0Sjk2Kq4UI8e+z1R0v/7UG7m209gOsIMvGWnOazW2bcJXt3DXj/xnHzZXCAMbwfFVQUP1utyAU/NN3QskAmD+Q51zAtraztsT090kjgiAOwra7buyQ88x2wkS6t/OI3HxwXVsm5/YyI57eJeMhIHtVHp2/dYmHuSXpkBQDwq4a0/B4lnUjKzRc8jsrUmQ+YpvT9ofh6xBUwaiH6HIb6ojKCM4t1vc4BQxjkwwuCYPad2gLQGmJnkalg3njeN0jaGcP01Naxvtpi5fQi6tcakqyT06Ccsc8j4PRAPmjG7KI4Yz8loHlNyHb5/pgsbgWsxfXLXd/J21JRVsWN8PBkeXu68noT0Pg8ja6TTGqFxjmW4htaZd4GuVDFsB4t7nAGzFLigD4ZA0ZLUgJ+Bfygc63RrqmI/L2OUyEFJzxzjqGkk43mI/Z0eO7zfG6OZa+SYdIK++If+uJCaDZLmwCm3sRY3cFxtyJf5H7iXCqavYb+57PqGPQeAqc5tYwH+KM0wq5yTLzFikdIJCq2FMEtqmpgIa4C5OZii4BmsM/dYhNRxvQobSX3
ajHGtopkA7IDQ7LJBLuuj+fMYoL3cMBOM4Hu56A3S5p9rIQV0yQCXDDt6QdHPbH0VgNlJisteaZ/fgTfLi57ZT0NbEDQKy0GPkhtUSbvlkJ+x0nCaDvfBe2rPtd1xiunIryIg6F/LACW8jDc64T7QSyJQxbsDDvuWOYKRmVWuUOCkO8Yh2Hi9HqPdiRToPT+7b8UrEjuP+kGNP27W6YMDUHEId/6UmQ23rWLxDAcjp3M9tR4LiNxYIHOp4A4cj+1wS9dz7IaEgB4c9/toGUUySuDE07xcDrTKsrNhVOyAuYOEzwkb/xqumeSrMfeSM8QZPG/C2YqMerFGecBGfrL05Ks45UITJjH75Y8dxMNK2G7i4mqhlj4ZwmQYRqvBtBFO+uEStFbLBV2BKNA/jFF13g+l3LgbS302Cu7q6QGZJ6zD4RLuVQc38xuvKYpEiYssEpoxY2sdGslQ/pzU56Rdmr+Ft7huoFd8Sb6nqyunSfUSP/AGHDT+EnVZUIliWJe/tLxhJCpUy1DEHB2ZcV7BuD1ZzkHy4+WyNWKUGLj2l4oRQjNpL+VeZhuKHRvsfxpF+7IUQc40JwEg8+8ZgAvfhfLSVHPGlVxbCaVN90eOFdEIQBaOfl+u4gKP0L/9iJEeh+kYctv7FsUkv9kbCAY3BRPK+tey/FtnTgoPEpfc2hfKgelP2ZWKkmcxx97bHDWrzAwWRcS+SKJY7HHKT3HgaO7TdubX+hb3LuQj+B7xKmTJzEKXGPJ+xIN1GJ0ogCvfgPsjr1hZ5HZqp5NXCMuLDPws8HCFWHVn6pKqmhRyZ3G2nMHjkmhMgQSC7tjGB7TWWnZazzgvK1XvdQVUGF13rvjD55A5jlYkl2uTeqr7LCQQNvcHVQvwYokWeOfAQNmRzaMXSugBkPFQ1QPTpa4F8vYuRCHOjNayYKNYYz5yQ0KRl3Cx3R3OL98NVaQlXzpGMhc4rHLWUmJLjMPs1udie+/huuEh5r4ZfZtieTjcPq6flTM3ua6GJoQ3Nc2/r4AAN7TYolIcThUJCqstCcxvn8klKtwbOLYOSMDydFNQxSCEOS8FoB4Bdmxax4uly1BOQW/ntTqnQVEILBuoXtxsnvjiV2TYz3HzrIzDK36vIB8RptWwW3Z7kUcstzMrucyjjHpcAUCi7mUW4xcRGK2wyMg9fXyVVo5zvqGHAWQO6EjWkOutr3cJnLtq5T1HSjjCI7xJTPNm6yeDhUlAH4dD9EhVxq3f/vrBWCVNyPKESjitKe47JPbxFhTst1GR9+ggDJqtaL4xeg8GcJoOtk3KdKB6uQ4CU3WAaxyVIorp9mLPEVmaSAHm4aAwffiA+egtjbzWzWMrzzs0ZLNt2wzL2ySLxmdbWlaAt+SQWXC0xliTeSP0YcTtV8MBltoyJ4vM/emJIHuC5B3XQiBEs6a1qsGxVkMTAFRD2MEdr/dG7RmdpB9V/xW+g5ke8F6ScpIVTyup2NFDqb2mUfJ6/SA2ie373dreVvFFbthSNtwUTdZVgUAS02SfSbWZQplVHDEeREH6+2YoGKfE1w+RZDWMWFPxwt6Ygm17Lt8NqzZZO6xYT5fcYqVqye7WdiOyD44tL7bT0u2Q1mBEUTPCHIG5hK+7KyhBbtp9nrp1E6S8zSHtGTLfNYLrTgFEmPOYGgDsRvLj1lMq8mWyqRKQ+avKi1eozmjTOAalGWCj5eA/5vG84UhyldG84HUr40gLlW/6XEyKebilO35r2YwdALjUT+c2H9VRvwUC319j0KdGJbBjP5wt5jmTEWqYjV1/Z1Pbo1OL5XyOfLKGw8nV0LPay5TOAL5gTRgIBX1E8hmf53PzCVZluvs6ON8WAm8x2M4nrCAJ3sw2+v81D16T2+evb6sqRNrVBJKHRY99YjS3GhMNdn08uIcLLVc3YiZhLTgQUuaoOyxOVtKxlZDOE+Tx7VVRl8SWZ7KdkdZH5o+i6FFF3LCc78Pox2RiDF+PvHY5N4uj4MGrq+uxEsO
/GQkn7Np1I/HF6tcKOdoOFfgOhDjB9/eWEczposYjSntEPSMWwa2EkESAtHGPZ5QHj9PF0Cd/6xRvIvWfWhaaEIbIHCaM4BweMUebzvsN55MnlNGNdPMtn5uIXorycD4yX2AMimYJktd09SWRjqwhOXd7hb/uPOqbTBX8n9IfXnvy13nmd9wwRsGTL//zx8TqN3W/e3Ahrmc0Tbs6ZKszXPcMRBNiYROYatVi4TfDl27SRymoM5kTb5f1qLxpAocZKvoxzMaF89YXuF5C6Ry4IMsW1SdCCTP+VKODpRNp4fsFYuRr9RH87Vj2uUj/BTxbm9Q2ms0mkgc0PLmDvNPyjK2wNDwiQ7qoT/vVgzmPi9TOfbvdAUP7gNa4DXwLXlOVAShaf6U/not8+5SdzZDajEILVeIqugs4WsCuBP6Anl/znWp66YjwuSi2VNeyvx2ByUHDmXULJKpUeVKWZ/BXgtyn7dEYHZB4F7e+TBffJNIHv83Sd2odmH1OhkhM+1JCcJjpmBeDRmMU2usU7/SJWDUGkxDCu+88STWckOg85PG6r5X0hsXYWbVeNZyuyIjTyGIguDcnD8OashDDkr8DQX0dp5BILuIP5SWDra86CClV9kdkAG/TDX9lEHogQw7Jq39IsGc77RQlKxdaHFiGcQJbrEJmtnL6uL46QQTxJTKddPRSPUQMUUCgxKwIg2AEdVzGCOouN2+YvZIeivfuxcNQLbwLJyIomyWz9a4l6YM7m1so9CnEB5Wu00k5gGvjq+z8hxZgbVbxQ1T93rbpBf3Cz9sm9QVlfH5GhHfz8Sxl4Nhg1R5ucSg0kSMnR0XAVe3M5SsKv6KtUhwcHIdFGDhEdRr7Igq+zrZtPNtYoNjTL9p2vuP4O3GuQt9/x9Z6Agr2eoFiFXJSqCfVRfsvcbTQhmp0FVNSBnHHlNvZmhkt3MMLq9w0MYptaLGmZUegXnF7uxsKBgQ/FONgAQFhvn9RIyDp6zSBaUPDu7eXRdOU2w8OmY12loKn2ObLZdcYD2f1hXbdaWUuRl7rCKYgoapdHabiBP6+HtH2iFl/Fptqixl1my11BuPN7F7Ocgdbg8LIy9b/6OMe7Kam5dAhmfy9GKAJbRPAe6hYxFFPnwYMqsGOtQX4EoJcYI/HDcBg16MtC9lzrRspHuLe8unYTX7IhPH95y+B28QJWOS7imhKHTyVH2Chk919ktOHYSVYTuBMRT93ZJqwugp5NifI2YRgRNtn8CgNIwtofJL6JDTfY3KSSRWmGM8oX0P/vRo/rG4U6XSKY9jcrA1ohBx/k4lVpPqAbsPhUZu7MOWIK3n2ptgL4JTLyRPn3BS86tvpRRhqpAmhUWYyOzAvrtzb8mDvuTdDTy4OzSYrqYqOrI9j6aElBLAFvO/N+Qsi0uDTucO0VHig9wgG+lUE0iJQgkzKyZAuwlw6rGuFfNzlY6KYOcxxmn9oDLZjDQaf1+yA4bTIVp8S4Kkw12qXiVpvdH6AcGqKvASDPJ6+ayGWPig1IOt2hbR9dS2AaVyBRTZ2cxJYc0QrHQvPM+uN1b4Tz3AnEb1NI118f1clRBgLcgYgxcphF04KEfRbcx7LbF/y1sgMRHRYaHODdBYZ7IMkIdvoypWmaAXDCX3AuYF7aO3+W8BA0ThUshDJ7HOLywrgzkxkeyiJ9IYWSLBYKnIBE58ciw4iSYKvYGL6syesCcNh6BQo+VpWRaEpdSkDml+DvgNPgttr4svD3LMVsVeHvqPrCy1znc43ZHkX/84UwA1NDd/uLxw3UGr7Qiv2laboyc89D6RQ62U5AVaRso2pcBl0SVKVtdGa2h1eSaByDfYm6sWEZkluTfJfEn8RErPcQ9lM+aPDikq34k6frFaDY5xS0tk1nkOdeqrN2XUsHP4pEw+HKd0TRt24XpgTqKa9HjgVhSu+lyoyyWnXyO04Td6zUK5cUbcb+/jSR9cy/8egSRSsmKow4Ec89KNAmnN7HljkCGXCdhpPziw
KIfup+a6qRu1EYGmEsjXqKHBC4gVgIO0XcfVHxtm/Kb3egbHVe8xbS6+blhZwT03nhbY0kUWrC2fQDOBfrdLL5NdQOh7rwDE2B2Z5pwwe3lb7M+cFNx57QXl4cpDJTYjJDUxew1Ne7KIk7QYIt9EfUwsG6yfdUkwZq90GT8pvQ0Ga6bmgA/cAYyNme1WVpGRaFL5ZPSGgyZ6YaFmyP7P8/zCJHgjWusgu6IwX6ad5gw4VT3GU9FoIiBFMgwV18egvI9x803oPepIaAnPjcteWOzVcKsWk+4H5bpLMrPfynaWWhcmNEsO7UfPJtU+jNuoZFya293o/iybdLXs9QOWHhoce5IOI12G5NbBFK8yHVxl+gcs5E3mcTGC7W3C2uqXryS784fkrvaaJtYMVPS5bZz0FA8LlXUBUYC6DqQt6SyMvK/XJMYSDVwepoCZuvxKvAii9IH0XC1PveYPpdPZxg3mkMpitmKZibVN3ApvKNT4fiHjF/4Ex1k1rCcWq+h3g0lFBH7HLwyx6CyptV2WlmulrNSL1Q85q4aht3K8GfyOhdGR43kt8segSI+xofrBmUsIcqS6uC3892PtcEfqFnI9X4EwjV0VChuOHduxdwLmORVWGJHGK3l5qHOvh93caHufhBdtr0JrNH5njtN3xIWump+5HgavQr4ElZR9DdoXwNJkqpsZm2l7gBKHSyDyfwisvprBgDRmrHmfuGYlwoXdvemTFbjbPgsLXLVqCD+lDJbngCsmMymn+xD2nF7U0R84/numzTU1OWVQ0cM+ANrPkaGFsBDxAu/CJOJ/G/j0FhlOltax/IWA2Flq4zQX9aFZBEcxZhUzpas1m7YkHRVhnvZ1IQPGk3WnFHJsIIGu8dhTJYkMUNuh0DQvTxprPaHvPmZCMylyTIlO2NLyP1ezHIxE1ZsNVwq7QPIzc9vVyaz8XD4+hoJ3hYidUhA9GCW88rbRooO0kT+0fNQob9X+SuWX/T8714Zxzf4j5FvxUh7A8JhEWLmaK1OxD6um/fsQCkhV5Q+1xhhlkXj7F/WZYTFy4w2VL6BoJ8o3fFp5WNy/Sp1vW1ozMV1wWNndwoJsau8tzF78XkYrgBlkKJorxzvlQJousWBH6yzTyNTfgLiA+hutIhsGC3s2vxA8BwrpizrKTK3ytDKGXVn8iUuqYXogrwrpznRg5XO76nbGI+gIK645imxrXZpi+LaRm0zJWIU2UkCAU5MuPz/OlnfFUVg53iPwNuFBJdwC8hAOLe+t6RaiWQCDgtadLbJdA77X+IhPpZU3Gvz+BX2LRWQJRXCAfHZI7TIDZdbFXRvdWa4f7ddEYlpPCmkEoPoQgnTPYRiZYwu6x2mT3836jNN1ru9I57hzjsn+4+0gZlgGhi9XypOaS4v1Ly2TuowxfCjC3MH9xGwkApTIMoc3M6OQdmW2JD7dbeTpBOVRKmzKqiCpmycp9J8y1NKmuSGDfqCYpLkY2fJg9zUBzVfaAINfG/Di2xc5XNalm7/H6dD0ou96XF5lbkdctEx/BwOn2kfT4sZYNM9bsde/34BBPeebO81nxc4XNhArYVkkE+qxc7EL75oqn5HMef7GkRYJBAI4caRObFF4f0erwAWfvo0HS2CJmqJ/O+3T7F4LGUAe9847YuEcXg1BcH3HOGrnGHfp/3A3NJzVaJmTsjdW+KFc9FgGvmG31lSD3G+io4BeY32UZ+JBFuI7QmNQZQ8fax9Soo1h2nq2B4dzyMpOHNSNyW0T0dQ5MBjD+Htp6djBjp72uDpronYutzwo0cE+hl45bktceexd9eOp1JBbQDDVTSci/YjalMcqnQ9M4gpEVmvRdfx8v9erz7MKXTFNXthKaQRV/jWe/i0957DLKJrxY7e2DpLIlEVcOld2HDyurDbGi363skPiZKteLj7rJcZWpvO12AcG84IsLyCgFgeh0/1aHiM4OSvltyH2U8ATU6z313WI0KCe2abXn+ZdXbO2tM1g1AkJ6LAn
DYaPClVOaabxDe5jZFm7YXgULB6FxfoC7GQcBa/DMPOnMMJxY/Ku+kt7gjH32T9zjIYkm5M5FoG2yFw2Qre21juDz8106bK3VWhAQVMoxB1xAWxZXWiJgSvYwKtDwllJSStaS+TgJVSp2IV9mixd47yJRucSBRbSxrUrMkv0xzpbReSpOvky/t2OAo1EWKA/M99dblLJHN769eeuzXKiMYaEBAObgdzp89daBe4hmv1dnv62jXPAkKQs5ZMWTeWqYZIkMA9ICytmvbnAi+umzSqEvaaras+xcoH7YCFqntRRYQdkvs1m26mBFaJoziZbK2/SB4itvH1SXTYI8AuG9xiiSanPG4F/k1ph01bW+gJxEtJ62nyVfqge92w93suCGupxAHiis5M7W+MfvzmWyDY5Edsy77WGsVqhmfVrxJiEJh4nP2iyCAQSfRKAow4DM/4ZiDlSJJxioRj9+s7sT8pX5BVcJZDS+Nitx9k+NCIvoQAuXV3SU9f80TmKX/z6vuYdzw+m6AD7Ml8wkT5NZ2hsbbKlFLcOiQSMY6x8+FJ1qzdPt0P1RlnwM2nIohSJEw88JOyeDabxLDAc4FRd1hLcJRFeIbo1/9vRlZeLRNBCU2mkp3/c6Mc7zvPIeG04V0vXZwFCbYnX5w7lZdo7sD4RkjgTxk9hZ3CHXlBrMI//W6u7C0XMXajlHMmG3jSzT26aQ85UUPwnvUnneYJ6pjQCzd9vZ6KGyO6Up4D8P/2n2DZenMedUpRJZShcHWzZg8/v7jWWbpRpZIc7wcESOgETq22T8MZ1yopCc03DDP3T+kDjH8hzOqkdMKnpzq1nsJXtORTpvKNz1ngJ59O9ubiKGnrDh6bTB/UkIYeF12st8BKXM4Nb5qIRe3j49qG2JCIpIY3+b6Mgv57WcceFwgELhMyjSc3BClA4xqqZZ2iazUJRiA34VCvoa7DXyWa4Lf0WWiWdWtO9tDKXu64bD72rmLXNAUr78IMZvJVD/BvdXVPKEKCKbMN30ePj4kL6TU73Q39ynvebXa+06vAoRjiOSdIsw7NTL1l4pVGef3Tb6M7yr3rC6mJtIVtghXK8F3NPJ51YU3FdpBVNoqCsn4VqPMvckUxGdHhwKJB+bhjsR9PoC7YHMTqv7pEy0HtuPLMvv3PD7I1vRWlxZgZaFztMZvS5ve0QAYrqgG8asrNu3Lkf6l/buYrpPtylEOtE+GUezMKrbnk/r/GodOqFyUxrfpCRZ1JXq74b9RIVFDgA9g1wu1wLsGvKVpxB+FDKl3zgrSh3YX40sz8Sl4fsXrBC+F4uLozk6yKile8eK+i4570X8wyvHXOvUGElfZq/9MMGjqZDAUFoN1AIAcMMWLHhzcLnfVvSuzt86qh/9exlM2k4CYYgnM/y4FHT0HLbXzq6NDBLSqau3IdgMNzBLpBBH3lIPhLzVcyFaDGnzzQw+OnVPQ+Bb7AYg11QibDUlsfpQi932GR2o5wtUmWjJheKquP1qKTJHc3BuaOTnxXUwQYe8zOI92WTVf4xwPLMh0SdC8O1CpTMmzaVqGxyDPzrc3H9xHaZ9co3lZJM3gKvQ2/u+hD7+EJzGNTJmuY73g+BVCu04IgAdUZOw9djPkhrKXTpmIv2kPN6Amb0Q1peGPcXtob1DuJrWUNSZySJ+8es8uFCUN6xkVFJZJbtQjEMoag0P4o3bqCgWYP3CZWqAeaYmwwZR+CePHvFseYm75+iC8UJUOUJ4RO9Ay9zWg13l4srKACqRpoUI+T8QOiy5TEJgsfozjYbTXkD3EHFzHHDqIxNclchD96yfDvpjuN68ubbrOxCBzcG2gAbiNIUHOnSQHOi/IsGP3MNLqNFKahQHwSAcfgFN3wrSTQGkkLrMP9p60kIYHiRrKdi37hNwkdgWLVLtYvyRdy27BepIFo240U65qLzzGe00NhNRSAIM09tNFnnTKPC1sCWpHsJYW0IGTC4E5kwonyybCwV5CEHDZMMZN/v5sXP
nHki0CcvikSLyFrQB/nqQRsLC/ROcafAP1NvHIcs7MrDZ0Vim459rrt9cuzP6ohFOArlHEc2fnbfBTxUNS+aqiY/FkptaMJjpALT4RNgeO6CWSEnycAKXGBAUcTYBmdYu/dO7xMBNiKvUH7d46lnpulD7pXNMB2WEW9PwIKfjPnEvJE+KK8cE1jRZTDKQ5z7zi7BqSMX3S78N5fKyOcR3+BKcnVjaeDPCBy7XWIZa6cMlbeJsD6aAD8oy/2GHe+KwqoKa2adTIYJH4+rzS+K+xDpDBjbRw8ooDmaiMyw1LtGxM2IaT5KUElyot19G99SjjzzwSR2Bty1T9VqLljInouaz5j0DrLE2HY7znFree8AVGaHkyUM7xuCf5VPmcnfCkTagHm42wMUQ29mq8NOpH+UUe88xhha52cXIFYxprN02N1w5Gfd2O4BYVpyyr1HVdfMPOU7uCl1rbf8nbNTTu6uLlkXxweuozNJ/XxG202/g/beXlsu69oBaS5woLJRS5/Pnstnx2h48ErvxKJTJRNx7Se9/Avz1kUpVEZa7TR8FY71WpSgIrgNtECktli5jpooHKbOiy0uSqNdai1AvMYeGHRGCdXPlNxIG5SwdGmc2Wv2ZYTe4xT3wvkcyz9N28hbA9uHPy/KSext/PzNnSlN5go26UcyzptFRRecX0vX2rCC1EDGCLZNI66BSVJLb+O4Lia5N8XXayoe3z+4WHU74gsiiJBGcjJ8NH3ownPKoXX5SyDa+5cIEc+PWp/rGOi2nxvZig6yzUtIVj+wfFsdBHH4de25u8afF7ikZY328JPXv8Dj07nNufEIXpHp9ixOeiCMYrPmiZXl/EEUf+EUtaYX8IE2Fh7BO1tV4iCIMXhMypSI/TB6NjQ4nj+YBE29MwSBC+VUNGKzRJjNRqhQsnjJu44dKAYN5BGCxgCNFDNrNLk//k3M1pY3EWXyfiwxa68/9Q1iXvBbJo9/28mgx4IZPKd7DfY7qxY4Z8w+HlsCengTWiWlrMgNtz2aooO0Lh5VB4C3bVs/IWn34/HhcXiMkZunoN4VzhQIc+q4mTl/MAPnuGJSh40AwJwkNTSNkMNWB7a6CQAl7YidBhFMFqXomddRB/kGHBm+BYMgyGNcPZC6GguavtzizDjp4o+4+nsUzIEb/mXWXfOrYYNKVlzBFG60RBlvWa9r4RZAr/kYNNdQAigzvZEENxDgkRioFxq8g1C1ddnBTpZSbfWR3/CPAqv5lPwFLN9e91fhkMY23KahiqIly2j5uEy6jnVe1VVnLqLtq/TghwEuQDDtBy8VkkSyMLvacrN6wPg+gIjtlM8qdFCnndBO3b9mZAzxbyiySE4egq+nowQMG0ggsSNvVYMgfd8l1j80cMP6EthKSA5poggIWiFN4hG7+Fi4qKtZwAzeBfCfyaq9C1+NA4IcZcSKBZr19YKW+aeiwxaHYxZaCt1hjZUfGxloSGlB8SE8dy5CGoQW3GDCmymiVNHFEhiTekuBWrTYs9bTANe+LwSw8xx07JgAGIU94tbqoYVljDgn2Q7Eiw3mT70SxofKX80AOUpebFELbudKRvH0a5rl/fAwiwb+fHbRI4x6V2OoMPdBiY0XlQfeqoeqF2mO5iSe1c7d8JKB+/CZqyOyf+0x9bNCz6qvGx5TeaudPynD6YriX3ovPO13VxS3uTanhX/x3bWkcEJ1M+kwZp3WXSpWHcg0ULP0fsfhWs0q+Fqr/v/Y10fF9nHfzFVoxPtXTg2rSjmOZ9446lKaFCF3sxcwwQnBS9T/GfXyHnb/SVqzTwCpVOUpKql5UY75RKAoCFr3+rMWS2FxEXOrWTh32NCVwraKbdZKImxhieYnDx7m1kWHCshLgnuFYHz4rbU3Io594wOR+UWOWn/SnyVHPfKyNjgsgGIPECEv7CcUUc+eBsZj9W5kLvUTfY6Nd5BwKLYPW72GZS9MVSKqXWHxFwnxD0aKHXRbIjJUpcmVRMmHgtQgfJLD
z9KYhRu7wtGxwxGu6UlL40tc3nZpYmecIoXRmyGfop+6fRXNhAbM0lDq7thzQiil7+BT5a8jiLo7f2kUym+HUbStNHVWRpJO4HEkAqStJSV+Bd0gDljBd17G+deNZgYrDkv2oNp+2Tp37wefYgfdQD8Lb90t3LVUXNyzF8YozCYMl8IMfR/BxNSledLri3KNb11iPByxn249tPJWCx5nRkf+deaiT3EmsOhfv5tr0Pr6Eth88ECOp4RMSN2gPv4XZyBeMX+XvfJNhySzyTA9TBciiuZX9QTpx+zunWMFIPyNzDAfzQTu4qmiC+hShPzlC9hTLtdffRgaDGfLpWulcam8Ti/I8QYU4NQNrnIYCGchwwJJwtBNVX+sZwovS3mgMaG0aQAHUfeE++Efh+69zjTCJDmNbBvPrzwz1lnKqDCIhFRJHLAFAjyBLIur80or5wfxWdHAgRAeGSYHTlGVIUvbDASGyX8qO3b/XCU96oIP8nbdzpOcrHJpQGzOwkFPtMkhDgnaiIJFQYzFde8GROkEjSn3SCBOk931dXfansNSuMcnuW1x+gtS7On7fM2nuY1JN1gpb8o5tIhkO+eyB3Hv6JDCw2h3dIN/SGZmgCSIULDj2989UWhpEkTWttTCGV60xja0CpJDI8+vjuAbhMcA+yZiUNS8T84O01IPgYJW+p/j5lRAWbqvxw+riKawqtPOfkVHyCvKQZK5nvRoO4AWUugSY/yM6G/GpT4J369b8n7b2kPYvuTmRq7rUKs2Zn35qNDbVj7XoXMxmQI9QkkT6f0K3Hji5dtCwn9KWgamJyUXU9QuDC+fEN33oGkIKtmKT6AYZI+n0r+1qvLe1+GhE53aWc1t/T2Lo5DtP128H7LGgTVDjGVIvCOJl2ig0wmKHMAE6KTvSTRvhSbTEy8XZgvlBkGVLqpfDx+IFr+l8oh1HsOlU1wGHTB2/rcpE9kqqu8BfUKRYH1zT1YA1OcV7iN4fELfihU7Y3TDZMPskHD91wyHbZfcvG7flVeFsZrfqT4Xv6Ok1HZiFZ1EMqcyl7W+bc2pYCZJd9vxwpdtOx9BjRjRRZemXYoK5HAIAYAolkBOYmwMVwvt0rZXRf5iw9qgm9Xbci3772uJ3qm8xDyxUnak8oMcBsyfcLu/3pV71SgswxkjGe4LQgFA7IO8Ss506OhTaD+tJc1qevRzVLVNe0g7m/V/WmjQQHi7VuNil1aU0tCrmyPj87jWC/cCjosDzAgIoJpB3hCepnyGuAt7vEUWG5SXh+Q0QzLuJRzagsNbai3L4PLmJvzqhIHHEMb2j7wXO3y0F3uBlcDxZbSvcz8uXP/uK+Ad3Bll2wHolzy6ZVdRkCUukMaL+K0e91XcBEZ7fhra96c7SxXrb0WDAObxj7l75r4giupIzumWiTha52Ei3HUHPxQgN3y+L+b26RKKSz4AUlUK+eVeYmkX19H5kCgukyaflIpHGPLuZvWWjyt8UKhDdELYBm56cU/zuVdV7sH5mA1wGZnopm2VOk6YSCJA9tA2150XDeYDeNclVfyoVZ4DUO41TUiF2Leh2apAJc9zmo/ynDXck/O28HmjH1XB5ZonuV/y4QXXL5eqdnKqBm494quk7PY9T75+jBmgd5TnadPODRKoRJbHzmQ/8R4P3j98Hp943Kzm0QKDjUeM9q3OjzJgt+CheqtvIOop5p528CMughy8fcu52kotUMdzuC6EWa0Bm30piWekEFmLimsSq/7gFiyUOSCA8Q9PV+HosWeYTjcx660zHEUbu22KSql4WYlOKXrkPKySvEx0v8nOAlJY5L5jvkY8igUoihpIcTATdsCuVcm1WAn2OuimTk5pCTxIjzhHrlYC4+Pb0Nm+hntnS5ZNY/n4kt/l7/z7aB5wz6vry0/sMDxqNajxG4Ux+rNwbCBUOQd2lUJMcHGkuPfwNYzNXYYIqJwR32LBSZ4NUX3RhctcryzB/2uUOXec9sm3DMaaVd5H2rGXWfiZ
Ah9hCxdsI23pMJyLklulUy9TgbKBsOFnPxSExYvqv/TfbELaiY8on8qtvHC5lmUit1MnUXKlIG5bI0oeoBm4sqzUdAoNW8MfKR+px4lfzsPgXbO9DzbUTneBvz53n+qvagOKvQkhHbuGpnhR67CeMi263RMX6cR2UaOxXzpc46MqQMsyMb+M8ug223cCwr6tIa0O+ks+9xT2o4MzlcLwTLrwmCyHYOm0wmAc/qioZr2Yr8Eyr1WfJsXWjR7VyN55vEhJyhcnmyKHPSdgirhzm4X1aWkYITvtEFvelwd/UEOVR1SIJCd0GNLEf8dPyGqMoklkJ4yIw1MvH8eEOV04t7x5Zq6jHy/HLl8QsmU1tCaK01WDHXg/eycvBuL/ToQwNzhMnhLumqLpeTOhVlCpFk3RSwmsxQiu3Z98Vt5fP/BaOI6ousUxlqGHIsM7TaAoHIlKSOQH6Lx962D4sd0UAFj39PKLjYhId2WI2LAUJexw28uV5L+SKoN+DsFXJzEX+sx1epZ6qx++1yorI6ayS0OcOCYAMzQ9zJCRbn/Ai7F7fmO42Hd8YUPYg5tvpANhJumGVvLLmwxn3jKk7+8+KgckeSChzplWLE8H0V266nZD//fWoYlOhvJajSnXwYVrTkW8ycrRpn8Ifa3PychsMItBYIUJFg1dpLshZyULQnZjl6UO5qZn9YJjVyViXqj3aiUmTXddlxyyB/DmHs5YWYJSd1AXsZVLNOlH9NDGwKiBHvlpkLcLPiUWIj3Rqmfx0Bb9Wz3hYhiRby7vvC+Pmx8xHoFI1Z46dpIlkfeMHFYBMfOcebTdVDyC9W4NGjAz7mxwoLOPLoS2fb9Lrx5GhUSjaC4X9rgdGJeRhGr7Vz7ePQp4Z1IPbRNSPRylAHrCt7xWLbq8BSi94588IGPKjqnhsz3JQdy31aJSVlhNcAdoiRAygZ47rouEhDpHCFwlOHxpWrJclFIpmoDyDeuoOYpSom0dqxJhcFJi/udohOvdpNJiEj+Zs54c3y7sn6BO/5b/CRuPSDWz9VbR//sZy2z6XuZCMHCMd0GbimBwEjLSG3qAPFi+u5tKoqLT5wd8M0AGox2q6Z0MKP+Co5Am8Nc7ylKAuBkmDS2R/Utq6aRSbLjHI+Psbz2AVvhp9ZjNqM63Umzfm/RHIf+ZzquXNCnRhkWy7XHR8hy3I51EIP9NGj/pzr2nrz1tVMeNBOaED2cabiH0Q2yV+D/3mM0Igjc4R21XRA+Nqu1FzB0A59TlN7TJM7akl6F1xVRG9gMv0JhDO7QkFJn3oDEaMbMaiPvmTHchA7dT6fpxCHHg3++ZcgBc+S6JRO+CdYGpj+6QDTi+h3vhPjYAZEqxNOfBEQUJjG69Z+2/+IXO3wzvjFA5bmn/53sa7PukK+KLubG82XflC7I0hYaCbFOCDXPC0WPxflXP238Ej5JYYV35MenejbaKRdeDz47PKPmU9NuTITVxGgS8FATI9hLXLBPtjBiF4FLVGfWMmNU3MJxF2dEDt9wLUO5JJIsf6/9DmeU80wYk4PD078yexYnVMOi6Ig5oEZ2bUkPYg+X6bUDzj5TVZEMHKpz8c3mCTYxgnwrUxSv9Xz4QncpOdvwYm2k3rNhKQPCmv/CjLG1LPhwY4yABFRHOPYcanQr2Jv7pphU50S3ew7tWfDXKI0ZAe59eEW+FsT+pcnLDpNGqG4A48askhPa40ZcQzrar95IfHwCym8eMRWW+FWbMy2Inu60Kfl4Kr2twq7N7vZYBqItIFwfAzhz+T2HKqCTAH+QBi6kwnUgiU1jkp+nMfg0KvDh099ug7MFjxVf53cuFokhgCNl7HkJbJKCMt3a4iiHftcdxQYqXqUNbSo2PRG5iuRskWka4pc+UG5sIWOrCVBXDXy0BT5FvmB66JRKnGp8ixdQHS94RVADQL372NsBWT4vv2O/BowpzQcgu208QykGFR5zI4NWvmKvrW6jvj5cWLoTMHuRiaPJsiVOTFV/O
V0ZXo7SqOtG36xmMAW1oP9JJhVmxlGJI8hDnMKwqZmifGfChNovGkSyUmxbBa1BqudSnWjNes3tz4ktFjXI7LL6v5c/FpHIhnTNy19Gb4wkTw4/bZPjmHvir4T3xrcqn18P/qzQyEeVQePhpaUPxdx3ZLmhko7nuq5/fGYY96//oq4ictZjMGNmm8Bo83kUxxoc0QKS8Vpg0zeKhvArDI5JNo7/1sLd5NeZW8wDA7KTmsA1UaiEHsGFXl4M6unB+r0e1FHszLr7UFTH4tgFlOCxw57o64ftG5cySzckHqKNHVoHNQPm3/1/n23+sXqUo0FyJVRbMCwD+ak0YNdI7t3Jibcf60B9KVMAIL94Qqi7K7zn4Qy4gr6QHEKPddmM1dnRKIKpcGUKD4BGKcvQmL4AdEEuOXkbw1uTKUkGuWzbPWCNJje3GcGqYauvjgQh06OYPM5KMrZtLTvX9AdxQeddS/1L+KqDsyeHT0aGXzqqufyqNoptSmHVvw268l1UxUaQd5ujYz3FRRcqUvCs2a/aLkprLGiDnyU15mVNMv82q2VEtsZIby5BLhry9Rx2gf/CR6ma9hFdbpVKMHHwNeWNoL1Os/lRMcSV1NlmdJlNUvoe7XDs8+2hFu2/bOghBG+Y6x0hBwqMazKT82ij22diS0e8IObTKVxemk1YxlV4lTjgMx9uHlqQhIH0DBBx0ljCi6bJnNbXvyvMUCwhK9geNFybsw8o2hwum3vBRrdKgWO9Lb6/9RXC5o/jy1E22uPP4JbEnv9BwLt4pnRqUlOosKZo5oLOfU3am1/9R5CcEnn8w4HO67qtMuuGAc4ZY/NY+6z2ghpBFNAXpj3RjgUCinFhTvqb3wl6Qa2db0GFomGfTYbz9+OOWvfLX4ClBaJ07PrtpkGeEIhfSmRbmmo1XXLoedw4/16oVbB7k2w2vv+f2LQXpyjB9NIy3t3FAlbkSBYY6+lJl1H1RucepZygDfXeypsgti43t1jB302LdExc00VvXVw6Pdrndwc4k2FovtxyD195DhOHHtfegMDc78kfEai/I4shk0z1MevqR+AhSwcbDtQv5DDYjQJa876dd0NW/iRo1pe7HbIL4V10Bt76RLYfm7XOXvJFDe4tMOGeO6ANh4DjlRdpu4d5oKXgsWduvWoEI7FG7yHnuh5AYmeshuJTEM9ihLeKNMvVLJV4Ic7rBbaI6epiddy9Q4doyBslSRcqbwGR7FIkEJAnDSXxphQ5AJNLvBJb4R6Gu4qV6bUWo0j4PoqbJc9thlj9Pi5pxZQoAcREg0UJ9PfH+QbR1pCKrLmj8eF5Q6t8nNuwgCiYp0p1yY6pFXyvKc4vG+PvZ6SE7aHwNtCG/gsSHlmzuQk2E4/0/20IeJNsgOlGnb4LAZiVuFQkbVwZrrHAGSsZGwPiveYuUgPtxHajtFSKbQhSC+4eT84mOAjf5S5XdRJzMbslIQcV+O+8SU65AVpdqhGQUW0kDT9/XGGPdqSn8acT7e28+n+vpdr7ihdjYT9nY6HjnKcb7OdMhQ9B1s5Ke5L2/H9qP8twMkYbDGlpF1UlAmekQchUyukpxoqBs8HaAwz5PWDAiLYR1nA+UxaQ/EAPxK+KvFU3y2tQn2ml43f5sAoOgeuS3KqSJvruOoJnUGaVMbmM9q6EI4984FGp0hFcAQsodhnPLwa2uWl+H0RLE/zKMgSMoAetLsH+mbiPXeuiThDstwcnCFvjnc5YxirZswuzqptDbvJCUR32ykRgm4IsCplQqCZpAJm3wMcc9S8XMsQgGNRbKWMHMAZjqk3twBiKvivUGStILGcOKm+2T8egs4M2lfMXtzy1/jNJDYnWr2PmFRUqLwKc6ba1cB/UkysLRhCHbF/ptMVQ1G5zgBHmCU/d8tDNazhHQCBVGzS8yGUB2SwJb4JJ2OZ40R6uTg3/lMmqItpfusNTF/Q1S4bygt2xKmKWwt6NrrpDYoUZaIRsC7kEkI0ccNojV9yPFuh/a
97adG13SRkw2w0KTgy16FYhM/M9E5gjxXaiLEg+oSrv1Y5ftbh/mGS4hEY5ceyLEUFflp70X3OYBghxwH4+8wFS25z/5rFKTb0s80SjPeENYj8Duy+WNy7OTENssPSwzGLOf5kaR/S25Ri/WYiwO4Zh8RkAv2qFV6/TxXcgKTOKmZX+k7owilFbDNj8o9NJGKgwF3xFIsRGSehzaEiPdPoJ6hK92oBDbFHEBGYjqJxInPJjM4Kb0AKwtMddDPKnJm5CRF4yTXjPTJu28wY3E9wO8FdNpqYV/Yzj4UREr73dnKGA6V0BBymjdLB7kYiwJwPv/Q8Fyff2Arlbnlvp1ohrGwYv/6CPyOqzzakf8UxLFDHgdSOntYBBVEHbkjE+oHomczqug8VrTE7+8nYW0zICAXLpwJQg0pXjZQ2WLSNIzK001am4gjXY7QHERJ1mcaRfR5uEc/L13pa2Z1z3Dq8jL9m7uDlQztyw8wWboLu7qlJQH0hitCqG4S/DcmJxblQPDQHFX7XY4sV1kv7nsuJhkSBeqdFo6XQ2EtjgG0YviW1BCn0TzGBbCpvVz7oAWpWiselTKoK6cbSaJr73n7nP79ehrxSVcEyjnBDThfaSB1W82f/B1EjOR772HX7BabDBVWyOgdjPbSlARPzkPd+uTbdWaG9GmAxZrXivaBpK0kRPfly6CKMt3/fw4ckTCg9WxQaoOABfMVk1TB1V3b7p4FTi8mcGuCKVStBAsERY4mp8PU/EZ04p/jhs4poBFJ4eZXu9919VB4XzH/znNFJwwiQMDD1u+HtnD61aCAoiz0Kaiejo7b/yy/AGdOvf5V333xo4h/Q0nHm4Xyz8w0YlXnImBpHvHwKF6tMNqBEsvpUgjRZYp/5f/+RzDuKgCWYMaKykff1nrWI/UQnq8wzrFVWp5OTxHJcRKJPNIPwa7deeGqtrin+T7BkZGit2bIAT1UqyHDNu5LTJXmL211NQkmVXsLMCdQ3YpeSnmSWmXdvjROBZjITJJyGbwnaln93I6TUMUcMvWeqZ7qtgJ5s+5dyADjMuWXOwOsUJ3vEkC9RZ5mAFL4reumovNsAjBCY4QfJtCMSjw20TWsNeBHKyGSZDMnIjhhNKkS/W4ERA5Lt1iRjNL54YdU0crEgbjChxvvsh0C6DN+bYRaedba0DSA46ZiZD9/DJjsqSoA4tYeWAaVCc9ExQC+2WFUFJBM7AZAUpYqcT5vuUv1oRDaSZ4rR7JXvC3pbyutPrY1uA3QTBMseC8l9UQYfQV/t5whvuS1uXVP2PWR2NJOon+vnL9kUk/Iy9L951W/YhPiORHbODUXVEC3Hf+q3TW7Vly1KyU32tD+eDlvBoMR9R8vwPI/QpVHV9V9bLGbFWPUglnQZMAaL46l0eh9FZCjTlGigReOi9LnnvKw5xPjXSk6z2BC8DW4OugrsvbHEU2eQLW+kDNynKL0GZReX7E0UTpCYWzau+dZgHQZ1QhoOACoqDAv3RoAgqCaOZ8XLvxSkHwdX0OLHdSmGsr0bnUlRPCp9bIqSDERbsJNUJglEoih+sfXcE1431HpkTSYrlggH2MqNdx/d3tF9vUSqxEJ87Hf7w0QCvSzE04PyLIYGQZ925x8ziwgM0BDX4vcn5BYfrG6fADQTAueHRSJqwWMiHzzpjQGByR3dmw6kVYShx82eidsCJ7N5BoQrYSawiNfU7KUINdN+o8L8i0EqSdW9WOi5fexVgqWTBYFwjidJ5HoZEs97G9ua4mVRzFOhTAsEu5o0KqHms0VwUSkyQiuMgmGJkA7qeLUHcj0F2w3TsD9mKTexHGyRsZHdPcZDCJ0q6GXHC09mPS0SvXHa8I1hRYR5JodtDQKJ/ACbg62eP2FKx+4m6U/QZni5UWTqQNV5BzoL+A7SDTnnp8vxO+qZH5sJDl7GX2NP15l9jkX1E8zPdygKlmCPyMCFRJKxC1ubeSU1CuqKqMz8dfh6+lXk3dq2bkj9vn1QxEhDuM
kyTsBQcLhOjVre+mTqjjRMSPKhQAg37P03AjpKLnDyd7uHf3WOwGe3hTyD2tPmXOULXyDEIEnaGL26wveOApH0ddwB92cXsUOzGu2Qfjk1jfh6lZ1Iwham+ICjlLvVnrp95nS+nhtHQHIfnxvH1EYd7Ka3IJS226FlDF/Wy2yN7UGKdG+iWmiRO2m1awrIFpPCi/0jZzQUTBvOZcBgXJEKOA57x0awn8CIn6hIGCr5lY0YCaUF4JENw6FONr0Zyzi1gAQSJpooB83xC9f8nXn8S9eOUBihTey3DFWebFxPuG0rWeaMkXk9FZBmZM4lZY1KnV7BSqzaaXEOic9jKw90zqZVPKttrzvMUr8nNUqhDhpUTVM14azFJZJAbhsb3oSEm4t6tS0N6iFKhckUx7n5J9YXEebfCyoDZOcWslXpiwmnPEVRyaiTlcfycZKOIFkakxj+1sUtII8WG2mCINJLaCI3mL7aEW3IEJpSbT4dFRQV/SMOKONPuj5fncSsF+3O2Q2rdAGZIaajHoC6ZzHtho51Yge0wl16HuI7XxwkhEw72KjAaoCrHn4CrLJ0oR2zQYbIuV9UeNQ78bot2ZCmFJKGa5LlIm1woeRzNsqAo9H+H6+4TNHUO037ubYb8srgTY1jyvXwQmv3+S0k5cEef00IMit0IA2GFxkoXm17tAmkulpK464dw/DQZ+Wl58HKCNZedmEg+9oa7C8iLvdltmfPWvyIhq54UuCZ+Q4Lab0kfJWmv9tIgg4HTK0cdqbqqTY/DbExjQ/vclJIqAqP4rDOKg3nwHJpDFPtRJfQJN015ymz8EG6Y/I/pNFJtHjd71A6fBp0UH1VPmtGIsk6jKMe64VjUQcAz6OH8Unv5czhSKRvl/LCAbpdjtQ1n+1B5n6siUTyYV3+oexDyNckI9tP0PZDWlxkQ2COpGaPSTC3UUebMmy89t0rS3pREU9CkiXzdTtW2iqVUdnaVzUbbLw1KsJs8yCrSBmae3JeFa5J4lJBC+XmGDGw5I3UqnrOY87m9ksx5N30W3MICMkkK548SU/niARTTd1tVl4bJbx0us1F4Y9sjiLCLDY6f0siYX8jacok/JWtMw0kdgSGw7tBN3fMpeIpGfAU4VEgYpLnRiM+R6I6I1+8S3ciObmqIXYGEtAvOEXAvGzZ/qknP1W/Xtbzyz7RMR/WlBkQodsqU9/GfHogfHRS1NEYos4nx3vB8vLRxulwmEmLGoDrb0qytBXuvOVT9Gep6ICO28nF+NQ1iR4PI53rG2HRRRUwYmCPymm996iPfR3PR4sfFLO6HETsdC9rFfsChg3+l2vDp9sMU4jsx9tvtpz5t0NsHLV7qXSTyjZ0zl+tE/iHG0JtyXJBGDZMCeYEgVJV/kBPKOR2cjGluizgYJPuPcOgJzzN+FYh2Hcq+lFAJgM4XBclVCbUZEXh+9YGR//MWLSQCdikMzZZchWo/u6xFIeJMMaJLveGgfD1MunjaCfbTty0SmLdcIG3SGAToIGyo5ExiNgQ9W/1JmbfX7z7X5jyPxYaYyYtbbpLA75UZo9iVYRgezeWGnBmhBX5gAzblyIjNUFEB3TVr3gr5CqCIVfWJITQ3Mwt69zwaKOxY3xhaPgnVh7XlZd3mXRcJHNO8YuSWFKM/eYxBtJqwKszq4S1LlYGS8m61rEfcA0YarLWYU7LloDICV7Lv6nGDFxihpxSPQfeDNvUMRbfA9B3tai4fUF1yNeoQ0nY/FCrkM3yBufCAVCxFtH9tU3uxA6XmQUODpjA1HXdQ2CLxxXBIACl332XsVNl9FkRa1HapPngbfXFGqgUWEnnp0PJY1GJKgwqIPLCknZYJQS2kPaW5SSBL/AAUWnPaVQIVDw3XjNhN4wXCNT5YdkrBto4m3GNZkE3ZeENFoOT7v04dHmOj+jP2OClnjZTugMyayVAknTUXjyxle68SwoYqyANyIRrGEF0T0hPI/zTxQLv/OVD3lI7zHyiAJYeDC6dTX
eAfmu56Nt03UH8sEBnFmODSZxO2m5bx5Gi4vgRD4cylor00mPydIzNFTadlRiV9AGqMhOihSZJIJoL2rou7etpBz3hse9mbpHTbBLnvNBx6Aa4z1kTmZ1xx4+qY5fUM/GsVfdQjuKvV7yfRaSIdpKPh/XS5nJB0vxjOQAiGHXgrV5fgF/pfcyCRTtqD3nAD6NTgyeMAqEsM53fbvOckLXK7kJc9+MHwHCL9GXJMCmfNRWRvC6kq1EqwoGsT6azxkcowstGeuEM/lK2LmAHvrjO6fBhq2In0KP+Y1UAPdQ8FtZuSAmOoIaz0CYzi26mZQEW7sc5mIbvdeXYUT5jbDmVg/7/0muNqmKg6S6qCePRfh50LbvszAnkqF72kdK7sh/4dpgSt6V4FSXhIjIhGWovSBJfjs1cKbhlgTc0wtv8h2/Y//JZEe7sLrtL4J1TpUhEor5BKfAyEYctV1HdJWrEhoXoGdaklwTtRcyFSewJvZtlXuGDydiyfWWsvhjXPQWy/b0DO4Nm0/LHsukJS+R9c0nhLHwpW4d1BEvQBSoAq6Cr04o1lqaq3vQULkEHaQToRa+eX2QV1fHBFhJhPbSiGJjaLBs7zzC4Dy/0BAnM70sg7msJdnnotE08aCEZgaMvN4Q7NFfD4XEAH1RNscjPV5ijs6W7TxtsvLNaXsI1Bslu+trAV/K0NFl9U5B/uzCqyGoAURBHmGZ4D2NnWDbezcWlpMQHYM2AEG1FxcWtEsNWm+c6j9ljdptXal00Bw2eJfT865B2oSm67RoYHAwy7qAtEhkmCZ/bHZ6yCY7oRpr30emq5zZvd3skiYwjpfcQC3Ut3kidYJl6Eg303kqjrV8LK/fzyoo/EZzpzF+jD3yY1FocTKgoFipb6Gx0pBbmxuwxVFul+VSRuKOt7XRuv4VASFgr+lMUptb0wzdZE+YNZ2T6/nVVe6i8+/EI7OlpwUEmubFvpJMONmzv1LgT5qlwOHKvkiOIgVPnBMf1iKQvXjMIxsbs49tufRmixkzqmPZQd4AFARmtRX2Itc1cd37XG5XtV0UEGHqw1ZTFW/w6YLRL4bholMqbd/pgOqujuJo4tU4V90BeeH5TCxb8YO8uaQRwUReXYtZNiEEXuJJ3pP1gLb7DkeJI28FZVwv3oaT7zNM4uD9kD9gJWlQPHKk3DEywqfAQnjHnbZmQCJRXJJbrCpoDvd5FUdc6n484A1N8+2vUQkFjvRM/+2jsnZ1FtpUOame/Y9oSDYOAjgzrxaoZLNavbRIhPif16dPUpLGzuWbiGFOrYXxE2yWHTGzBEb4PoRnEATKlwFTneLGivwjo+9LAauGHi7G64QkMYyws7YwZS9yW3oB2LgFen4/EvdbXFsDViZziwXD7Yw/UMhyvMIBICIzx1PiVHkvavwOkxo5acbm/z1QrJCd/Kzoq6L4epoFzBfcmgMN16PlxUyZQy1dpEtZcdQd/qAw3GqTuvcmiigqaPSQ5IKenEuhHgcJRi9qavJHMdlM0Zs0fW4zA2+YPei9089g2/Mdf6j1O99dhOCm6i2gIiSjzNpT/HEKWMzjZqFTH6VFyBpQ/mH1EmSI69xBs/drMs1fGtKyO1xI9vTh04sRd5M12xxeU++a8iUQaA8x44oKVrpBvB8DoBr56JZPQBbwhS47aHuWz/388Mp95aAV+5CYzoCHZ7fcf9DMP7IHGdTPrfVYWMkGDyNsmsFr/7vNQQVeNrNEt2JrwUNYCwWo/XfVh3myKfj3G1iTdaw8b/9CQJ/2vvH/EVTG6W6omvMNQzq69PPNfC+Cdf+Wg2FKHWBd8qBI1tEqs9Kp2EnhxfkZK+GgxpMf3Dcwjj3NQ86rXqela7bfwgdrMxW2ncw/oqVx0rzBXyazcyCou3vpnc8oZDi9x1hL9EYeRilnxsNI0I0ikXqv/XU7gQmGc1flwog+aK/oTkb2sosyf6/mPqzQ7zzq/Fhdt0VlVp719BMMV1JdXFE+qz5Rl6br9b5
EEU78DyBBSo948HL1QVyvfO5A+Y6KlINSXFNiGcVFj8WaAF6l61gc3CTtsuQULu5KQEcXU4jeR0Gg9Lljz9Wq+PCGvjR9BZ3RTzRKoYjfs+5F1l+Rk7B32X23ZxZJdAFf+wpapAgEM2uUDCNHFmDyN5hFQwjIgThCFhmH8y43lJFuMV+p4hfGTyX4E5O4DS625Rl4SxowgjFvzKCT4dC/6Vp5cIW0dY6OVXZzgETrj1j+TL6IKl/Ctip295oilmeJlNXpZ6js0X2ZnnfTAMM56L0fCLtsPPXgSSAx82YQrdDpShjY7CxdPd3pYYbNIpAgotxEVB2icwDMRUwVsuBYEB9OprDIeFNt6VeZcIGJe9tJkQK+QInBSakYNFyHMKJNWHuBEmeSTo8leV09BuF0AMWNqZbKe+viO4I5a5U4uxPPjklmnIlwoTl2GjVR+QNUxI1ZqPqBs0OWhPCRpvi+VshkKxBpfescjxBimms1fG1rXw8a6sZmw2h/WNVlpdTK2HX1UI8eB3Qq3Bb5xF+2Gt3TjwjmY0akezTqseSYYK6S2ieOq+C9qEn0f+Jj2kM3BwIm0MgQeC+UW3vBJgLo6fqmgSkJiw8voU+zvqT6v/7PfsdqT0Yn5PBWZYo/gMrECK4SKm3FuCu1XminVXFcR+733KLwLmoY06CaaDx53Bnt0ssAp4ERbzUb3cXXm8as3MkLD3kLdoC/NAycc2xzaZuob4lFlvYO2K8YOExVX50ZkzO3kiFjHQstyuFyeD6qU9IWuBI1V7ET9X/mZ6RCEO/Zm/++cH8wXMWBRAp8T8E5p+yi2EipfA7LUfEPMc7eNlqvf/NW5cCOXuvP6NRGmmgtsPpOpULqF+If1we73zsmIAhwbzIrypr/YhwNU1F8EOIA1WBwA3/Fqm6EDeKOlRkt36RnHInZslTB3UvKT9fvmCT+Bfi7p47x3jjapd8EFcKBiUEKqMFFfYtVgpkDjcDBWfwCwB5fVmDb+7HUY1484VdJ0GMPpZTLvqFUdzmLHIcoOukXbW6HhWprP5YeTKALzCphJpQq0YiVpHSfS6Nik7eh+YXh4S0KSDBBXuoNR0zfa1Y/kpp44IiAFzqXmBSxvBik5HcU+X9+ow10lPOet2LknL+woBDoLyk6f/kMHMLTM6ZWxCXiDQGKXgw/cAmR22sAGZ2qXbaphB2Cp9ckCeVeJPxF5dS6OUcs9dz9EU/kkEdpUVYoUIlQuLiRmLb9aLT2RjsSzqYADqqH6Sb0F4kISO9qXfYKaxRBWPPcoMFMuvmNdk6++StNkB5PTWs+ERpHulCRKf+U0=', '__EVENTTARGET': 'MoreInfoList1$Pager', '__EVENTARGUMENT': page } url = 'http://ncztb.nc.gov.cn/nczbw/jyxx/{}/MoreInfo.aspx'.format(categoryId) response = requests.post(url=url, headers=self.headers, params=params, data=data, cookies=self.cookies).content.decode('utf-8') selector = etree.HTML(response) except Exception as e: print('load_get error:{}'.format(e)) # time.sleep(3) # self.load_get(categoryId, types, page) else: print('第{}页'.format(page)) # div_ele_li = selector.xpath('//ul[@class="ewb-right-item"]/li') url_li = selector.xpath('//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href') # for div_ele in div_ele_li: for url in url_li: # div = 
etree.tostring(div_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8') urls = 'http://ncztb.nc.gov.cn' + url # self.load_get_html(urls) if not self.rq.in_rset(urls): self.rq.add_to_rset(urls) self.rq.pull_to_rlist(urls) def init(self): count = 2 while self.is_running(): if self.rq.r_len() <= count: count = 1 try: spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)] gevent.joinall(spawns) except Exception as e: print(e) def run(self): # print(os.getppid()) threading.Thread(target=self.init).start() flag = 2 task_li = [ {'categoryId':'002001/002001002', 'types':'002001002','all_page': flag}, {'categoryId':'002001/002001004', 'types':'002001004','all_page': flag}, {'categoryId':'002001/002001005', 'types':'002001005','all_page': flag}, {'categoryId':'002002/002002002', 'types':'002002002','all_page': flag}, {'categoryId':'002002/002002005', 'types':'002002005','all_page': flag}, {'categoryId':'002003/002003001', 'types':'002003001','all_page': flag}, {'categoryId':'002003/002003004', 'types':'002003004','all_page': flag}, {'categoryId':'002009/002009001', 'types':'002009001','all_page': flag}, {'categoryId':'002009/002009004', 'types':'002009004','all_page': flag}, {'categoryId':'002004/002004001', 'types':'002004001','all_page': flag}, {'categoryId':'002004/002004002', 'types':'002004002','all_page': flag}, {'categoryId':'002004/002004003', 'types':'002004003','all_page': flag}, {'categoryId':'002004/002004004', 'types':'002004004','all_page': flag}, {'categoryId':'002004/002004005', 'types':'002004005','all_page': flag}, {'categoryId':'002005/002005002', 'types':'002005002','all_page': flag}, {'categoryId':'002010/002010001', 'types':'002010001','all_page': flag}, {'categoryId':'002010/002010002', 'types':'002010002','all_page': flag}, {'categoryId':'002010/002010004', 'types':'002010004','all_page': flag}, ] count = 1 for task in task_li: for page in range(1, task['all_page'] + 1, count): try: categoryId = 
task['categoryId'] types = task['types'] # self.load_get(categoryId, page) spawns = [gevent.spawn(self.load_get, categoryId, types, page + i) for i in range(count)] gevent.joinall(spawns) # print('第{}页'.format(page)) except Exception as e: print(e) if self.rq.r_len() > 10: threading.Thread(target=self.init).start() def main(self): self.run()
class GovBuy(object):
    """Spider for the Jiangxi government procurement site (江西政府采购网).

    Pages through the site's JSON listing web service, resolves each
    announcement's detail URL, parses it with lxml, and stores the result
    in MongoDB via ``StorageSetting``.  A Redis list/set pair
    (``Rdis_Queue``) provides URL de-duplication.
    """

    def __init__(self):
        name = 'jiangxi_ccgp-jiangxi_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://ccgp-jiangxi.gov.cn/web/jyxx/002006/jyxx.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        self.rq = Rdis_Queue(host='localhost', dblist='jiangxi_list1',
                             dbset='jiangxi_set1')

    def is_running(self):
        """Return False once the Redis list is empty but the de-dup set has
        seen at least one URL (i.e. all queued work is done); True otherwise."""
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        m = hashlib.md5()
        m.update(sign_str.encode('utf-8'))
        return m.hexdigest()

    def now_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed announcement and refresh the running flag."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort extraction of a 'province-city' string from *strs*,
        falling back to *pro*.  Implicitly returns None when ``transform``
        raises (original behaviour, preserved)."""
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '',
                                     re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, data_dict):
        """Fetch and parse one announcement detail page.

        *data_dict* is one row from the listing web service, e.g.::

            {'categorynum': '002006005',
             'infoid': '99c03675-a099-412e-b97b-7d45ee9c3872',
             'postdate': '2018-06-08',
             'title': '[省本级]...单一来源采购征求意见公示'}
        """
        try:
            publish_date = data_dict['postdate']
            url = ('http://ccgp-jiangxi.gov.cn/web/jyxx/002006/'
                   + data_dict['categorynum'] + '/'
                   + ''.join(publish_date.split('-')) + '/'
                   + data_dict['infoid'] + '.html')
            response = requests.get(url=url, headers=self.headers)
            if response.status_code == 404:
                return
            selector = etree.HTML(response.text)
        except Exception as e:
            # fixed typo: message previously read 'laod_get_html'
            print('load_get_html error:{}'.format(e))
        else:
            title = data_dict['title']
            try:
                # status = the two CJK chars preceding '公告', e.g. '中标公告'
                status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
            except AttributeError:
                # re.search returned None (no match) -> generic status
                status = '公告'
            _id = self.hash_to_md5(url)
            area_name = '江西'
            source = 'http://ccgp-jiangxi.gov.cn/'
            table = selector.xpath('//div[@class="ewb-detail-box"]')[0]
            content_html = etree.tostring(table, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            result_dict = {
                '_id': _id,
                'title': title,
                'status': status,
                'area_name': area_name,
                'source': source,
                'publish_date': publish_date,
                'detail_url': url,
                'content_html': str(content_html),
                'create_time': self.now_time(),
                'zh_name': '江西省政府采购网',
                'en_name': 'Jiangxi Province Government Procurement',
            }
            # print(result_dict)
            self.save_to_mongo(result_dict)

    def load_get(self, page):
        """Fetch one page of the listing web service and process every row."""
        try:
            params = (
                ('response', 'application/json'),
                ('pageIndex', page),
                ('pageSize', '22'),
                ('area', ''),
                ('prepostDate', ''),
                ('nxtpostDate', ''),
                ('xxTitle', ''),
                ('categorynum', '002006'),
            )
            url = 'http://ccgp-jiangxi.gov.cn/jxzfcg/services/JyxxWebservice/getList'
            response = requests.get(url=url, headers=self.headers,
                                    params=params).json()
        except Exception as e:
            # was a bare `except:` hiding the actual failure
            print('load_post error:{}'.format(e))
            # NOTE(review): unconditional retry can recurse forever on a
            # persistent failure; consider capping the retry count.
            self.load_get(page)
        else:
            print('第{}页'.format(page))
            # SECURITY: eval() on a network response executes arbitrary code.
            # The payload is JSON-in-a-string; json.loads(response['return'])
            # is the safe equivalent -- flagged for review, behaviour kept.
            response_li = eval(response['return'])['Table']
            for data_dict in response_li:
                self.load_get_html(data_dict)

    def init(self):
        """Consumer loop draining the Redis list (run() currently processes
        rows inline, so this is only started from the commented-out thread)."""
        count = 8
        while self.is_running():
            # shrink the batch once the queue is nearly empty
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Iterate listing pages, spawning one greenlet per page batch."""
        # threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 3156},
            {'all_page': 3},
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    # self.load_get(page)
                    spawns = [gevent.spawn(self.load_get, page + i)
                              for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Suzhou government procurement site (苏州政府采购网).

    Listing pages are queried through a JSON search endpoint (``load_get``);
    project ids are de-duplicated through a Redis set/list pair, and each
    detail page is fetched, parsed and stored into MongoDB (``load_get_html``).
    """

    def __init__(self):
        name = 'suzhou_zfcg_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.zfcg.suzhou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zfcg.suzhou.gov.cn/html/search.shtml?title=&choose=&projectType=0&zbCode=&appcode=',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()
        # Redis queue that hands project ids from the listing crawler to the
        # detail-page workers (list = pending work, set = already-seen ids).
        self.rq = Rdis_Queue(host='localhost', dblist='suzhou_list1',
                             dbset='suzhou_set1')

    def is_running(self):
        """Return False once the work list is drained but items were seen; True otherwise."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed record, then poke the running-state check."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort extraction of a ``province-city`` string from *strs*.

        Falls back to *pro* when the lookup yields nothing; returns None when
        the underlying ``transform`` lookup raises (best-effort by design).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; keep the silent fallback, narrow the catch
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, pid):
        """Fetch, parse and store the detail page for one project id *pid*."""
        if pid is None:
            return
        try:
            url = 'http://www.zfcg.suzhou.gov.cn/html/project/' + pid + '.shtml'
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="M_title"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                # Last two CJK characters before 公告 classify the notice type.
                m = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
                status = m.group() if m else '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//div[@class="date"]/span/text()')
            if publish_date != []:
                # Fix: the original called .group() directly and raised
                # AttributeError when no date matched.
                m = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(publish_date))
                publish_date = m.group() if m else None
            else:
                publish_date = None

            area_name = '江苏-苏州'
            source = 'http://www.zfcg.suzhou.gov.cn/'

            # Fix: guard the [0] index — pages without the content div used to
            # raise IndexError (sibling classes in this file already guard this).
            table_ele = selector.xpath('//div[@id="tab1"]')
            if not table_ele:
                return
            content_html = etree.tostring(table_ele[0], encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '苏州市政府采购网'
            retult_dict['en_name'] = 'Suzhou City Government Procurement'
            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def load_get(self, types, page, _retry=3):
        """Query listing page *page* of notice type *types* and queue project ids.

        *_retry* bounds re-attempts on network/JSON errors; the original
        recursed unconditionally inside a bare ``except:``, which could
        recurse forever when the endpoint stays unreachable.
        """
        try:
            data = [
                ('title', ''),
                ('choose', ''),
                ('type', types),
                ('zbCode', ''),
                ('appcode', ''),
                ('page', page),
                ('rows', '30'),
            ]
            url = 'http://www.zfcg.suzhou.gov.cn/content/searchContents.action'
            response = requests.post(url=url, headers=self.headers, data=data).json()
        except Exception:
            print('load_post error')
            if _retry > 0:
                self.load_get(types, page, _retry - 1)
        else:
            print('第{}页'.format(page))
            response_li = response['rows']
            if response_li == []:
                return
            for project_id in response_li:
                pid = project_id['PROJECTID']
                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)

    def init(self):
        """Worker loop: drain queued project ids in small gevent batches."""
        count = 3
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # shrink the batch as the queue empties
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl all configured notice types, with a background detail worker."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'type': '0', 'all_page': 2},
            {'type': '1', 'all_page': 2},
            {'type': '2', 'all_page': 2},
        ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    types = task['type']
                    spawns = [gevent.spawn(self.load_get, types, page + i)
                              for i in range(count)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # NOTE(review): placement reconstructed from mangled source — a final
        # worker pass for ids queued after the first worker exited.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
class GovBuy(object):
    """Spider for the Wuhan government procurement site (武汉政府采购网).

    ``run`` walks a fixed set of paginated listing sections; each listing
    ``<li>`` element is re-parsed by ``load_get_html`` to fetch the detail
    page, which is stored into MongoDB.
    """

    def __init__(self):
        name = 'wuhan_cgb_wuhan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://cgb.wuhan.gov.cn/notice/zbgg//index_2.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        # Redis queue (list = pending work, set = already-seen ids).
        self.rq = Rdis_Queue(host='localhost', dblist='wuhan_list1',
                             dbset='wuhan_set1')

    def is_running(self):
        """Return False once the work list is drained but items were seen; True otherwise."""
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of *sign_str* (used as the Mongo ``_id``)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed record, then poke the running-state check."""
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        """Best-effort extraction of a ``province-city`` string from *strs*.

        Falls back to *pro* when the lookup yields nothing; returns None when
        the underlying ``transform`` lookup raises (best-effort by design).
        """
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:  # was a bare except:; keep the silent fallback, narrow the catch
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            return area_li[0]

    def load_get_html(self, li):
        """Fetch and store the detail page linked from one listing ``<li>``.

        *li* is a listing-row element (or its HTML string); title/status/date
        come from the listing row itself, the article body from the detail page.
        """
        try:
            selector_li = etree.HTML(str(li))
            url = 'http://cgb.wuhan.gov.cn' + selector_li.xpath('//li/a/@href')[0]
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)  # parse check; body is extracted via BeautifulSoup below
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector_li.xpath('//li/a/text()')
            title = title[0] if title != [] else None

            status = selector_li.xpath('//li/div/span[3]/font/text()')
            status = str(status[0]) if status != [] else None

            _id = self.hash_to_md5(url)

            publish_date_li = selector_li.xpath('//li/span/text()')
            if publish_date_li != []:
                # Fix: the original called .group() directly and raised
                # AttributeError when no date matched.
                m = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date_li))
                publish_date = m.group() if m else None
            else:
                publish_date = None

            area_name = '武汉'
            source = 'http://cgb.wuhan.gov.cn/'

            # Fix: pin the parser; BeautifulSoup(response) guesses from what is
            # installed, warns, and can parse differently across machines.
            soup = BeautifulSoup(response, 'html.parser')
            content_html = soup.find(class_='art_con')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '武汉政府采购网'
            retult_dict['en_name'] = 'Wuhan Government Procurement'
            print(retult_dict)
            self.save_to_mongo(retult_dict)

    def load_get(self, url, _retry=3):
        """Fetch one listing page *url* and process each of its rows.

        *_retry* bounds re-attempts on fetch/parse errors; the original
        recursed unconditionally inside a bare ``except:``, which could
        recurse forever when the page stays unreachable.
        """
        try:
            response = requests.post(url=url, headers=self.headers).content.decode('utf-8')
            print(response)
            # Fix: pin the parser (see load_get_html).
            soup = BeautifulSoup(response, 'html.parser')
        except Exception:
            print('load_post error')
            if _retry > 0:
                self.load_get(url, _retry - 1)
        else:
            ul = soup.find(class_="news-list-content list-unstyled")
            # Fix: pages without the listing <ul> used to raise AttributeError.
            if ul is None:
                return
            for li in ul.find_all('li'):
                self.load_get_html(li)

    def init(self):
        """Worker loop: drain queued items in small gevent batches."""
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # shrink the batch as the queue empties
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                          for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl every configured listing section page by page."""
        # threading.Thread(target=self.init).start()
        task_li = [
            {'url': 'http://cgb.wuhan.gov.cn/notice/zbgg//index_', 'all_page': 3},
            {'url': 'http://cgb.wuhan.gov.cn/notice/cggg/index_', 'all_page': 3},
            {'url': 'http://cgb.wuhan.gov.cn/notice/gzgg/index_', 'all_page': 3},
            {'url': 'http://cgb.wuhan.gov.cn/notice/fbgg/index_', 'all_page': 3},
            {'url': 'http://cgb.wuhan.gov.cn/notice/dylygg/index_', 'all_page': 2},
            {'url': 'http://cgb.wuhan.gov.cn/notice/qtgg/index_', 'all_page': 2},
            {'url': 'http://cgb.wuhan.gov.cn/notice/jkcpgg/index_', 'all_page': 1},
            {'url': 'http://cgb.wuhan.gov.cn/notice/dzscgg/index_', 'all_page': 2},
            {'url': 'http://cgb.wuhan.gov.cn/contract/index_', 'all_page': 2},
        ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [gevent.spawn(self.load_get,
                                           task['url'] + str(page + i) + '.html')
                              for i in range(count)]
                    gevent.joinall(spawns)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()