def job_gdeegd():
    """Collect article seeds from gdee.gd.gov.cn announcement listings.

    Walks all 63 paginated listing pages, extracts article links from each
    page, and pushes every link not yet seen (per the global FILTER) onto
    the crawl queue via push_seed().
    """
    # Listing-page URLs: page 1 has no numeric suffix.
    list_urls = []
    for page in range(1, 64):
        if page == 1:
            list_urls.append("http://gdee.gd.gov.cn/ggtz3126/index.html")
        else:
            list_urls.append("http://gdee.gd.gov.cn/ggtz3126/index_%s.html" % str(page))

    # Headers are loop-invariant; build them once instead of per page.
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'http://gdee.gd.gov.cn/ggtz3126/index_3.html',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'm_bt=yes; openstack_cookie_insert=62355311; _gscu_1815356153=89127015q6kzl720; _gscbrs_1815356153=1; UM_distinctid=171ff59ebaa814-0ab1f7a365db41-d373666-1fa400-171ff59ebab197; CNZZDATA3588456=cnzz_eid%3D214537553-1589123201-http%253A%252F%252Ftest.gzjirui.com%252F%26ntime%3D1589123201; _gscs_1815356153=89127015ev2u6d20|pv:2'
    }

    for url in list_urls:
        t1 = time.time()
        response = requests.request("GET", url, headers=headers)
        selector = Selector(text=response.text)
        # BUG FIX: the original rebound the name `urls` (the very list being
        # iterated) to the per-page detail links; renamed to avoid shadowing.
        # NOTE(review): li[3] selects only the 3rd list item of each page —
        # confirm that is intended rather than all items.
        detail_urls = selector.xpath(
            "/html/body/div/div[3]/div[2]/div/div[2]/ul/li[3]/div/a/@href"
        ).extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)          # mark as crawled
                seed = Seed(url=u, downloader='gdeegd.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of gdeegd (Used: %s)' % (time.time() - t1))
def failure_dog():
    """Hourly watchdog that re-queues failed seeds.

    Scans every Redis list named 'Failure:*'. Entries whose failureCount is
    below 4 are pushed back onto the main seed queue; the rest are dropped.
    Runs forever, sleeping 3600 seconds between passes.
    """
    while True:
        failureNames = RCONN.keys('Failure:*')
        for one in failureNames:
            length = RCONN.llen(one)
            length_drop = 0
            for _ in range(length):
                txt = RCONN.rpop(one)
                # SECURITY NOTE(review): eval() on queue content executes
                # arbitrary code if Redis is ever writable by untrusted
                # parties; ast.literal_eval would be the safe equivalent
                # for a plain dict repr — confirm payload format first.
                js = eval(txt)
                if 'failureCount' in js.keys() and js['failureCount'] < 4:
                    RCONN.lpush(settings.REDIS_KEYNAME, txt)
                else:
                    length_drop += 1
            # BUG FIX: the original interpolated (length, one, length_drop),
            # swapping the queue name and the count in the message.
            msg = 'Roll Failure_%s back to Seeds: %s (drop %s)' % (one, length, length_drop)
            writeLog(msg)
            sendMsg(msg)
        time.sleep(3600)
def job_scjgjjs():
    """Collect article seeds from scjgj.jiangsu.gov.cn listing pages.

    Walks 20 paginated listing pages, extracts article links from each, and
    pushes every link not yet seen (per the global FILTER) onto the crawl
    queue via push_seed().
    """
    # Listing-page URLs: page 1 is the bare index, later pages use pageNum.
    list_urls = []
    for page in range(1, 21):
        if page == 1:
            list_urls.append("http://scjgj.jiangsu.gov.cn/col/col70311/index.html")
        else:
            list_urls.append(
                "http://scjgj.jiangsu.gov.cn/col/col70311/index.html?uid=277431&pageNum=%s" % str(page))

    # Headers are loop-invariant; build them once instead of per page.
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'http://test.gzjirui.com/magicflu/html/form/records2.jsp?spaceId=02393294-327d-43ed-835e-d8fe778772a8&formId=-1',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '__jsluid_h=8011b3a4cb561d1de121a1fa390ab4df; _gscu_1226760719=8861650310idmp17; _gscbrs_1226760719=1; yunsuo_session_verify=75a060942bec9e14902b3b5453719ad1; _gscs_1226760719=t89123468mg3q2f70|pv:3'
    }

    for url in list_urls:
        t1 = time.time()
        response = requests.request("GET", url, headers=headers)
        selector = Selector(text=response.text)
        # BUG FIX: the original rebound the name `urls` (the very list being
        # iterated) to the per-page detail links; renamed to avoid shadowing.
        # NOTE(review): li[1] selects only the first item per page — confirm.
        detail_urls = selector.xpath('//*[@id="277431"]/div/li[1]/a/@href').extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)          # mark as crawled
                # NOTE(review): downloader 'gdstc.crawl0' and the 'gdstc' log
                # text look copy-pasted from job_gdstc — verify whether this
                # spider should use its own 'scjgjjs' downloader instead.
                seed = Seed(url=u, downloader='gdstc.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of gdstc (Used: %s)' % (time.time() - t1))
def job_comgdgov():
    """Collect article seeds from com.gd.gov.cn announcement listings.

    Walks 15 paginated listing pages, extracts article links from each, and
    pushes every link not yet seen (per the global FILTER) onto the crawl
    queue via push_seed().
    """
    # Listing-page URLs: page 1 has no numeric suffix.
    list_urls = []
    for page in range(1, 16):
        if page == 1:
            list_urls.append("http://com.gd.gov.cn/zwgk/gggs/index.html")
        else:
            list_urls.append("http://com.gd.gov.cn/zwgk/gggs/index_%s.html" % str(page))

    # Headers are loop-invariant; build them once instead of per page.
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'http://com.gd.gov.cn/zwgk/gggs/index_16.html',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'UM_distinctid=171ff59ebaa814-0ab1f7a365db41-d373666-1fa400-171ff59ebab197; openstack_cookie_insert=81202878'
    }

    for url in list_urls:
        t1 = time.time()
        response = requests.request("GET", url, headers=headers)
        selector = Selector(text=response.text)
        # BUG FIX: the original rebound the name `urls` (the very list being
        # iterated) to the per-page detail links; renamed to avoid shadowing.
        # NOTE(review): li[4] selects only the 4th item per page — confirm.
        detail_urls = selector.xpath(
            "/html/body/div[2]/div/div[2]/ul/li[4]/a/@href").extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)          # mark as crawled
                seed = Seed(url=u, downloader='comgdgov.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of comgdgov (Used: %s)' % (time.time() - t1))
def proxies_dog():
    """Watchdog that keeps per-spider proxy pools topped up.

    Every 60 seconds, for each plan flagged isProxy whose 'Proxies:<spider>'
    Redis list holds 3 entries or fewer: flush the pool and refill it from
    the kuaidaili API (up to 3 attempts).
    """
    while True:
        for plan in settings.PLANS:
            if 'isProxy' in plan.keys() and plan['isProxy'] is True:
                key = 'Proxies:%s' % plan['spider']
                if RCONN.llen(key) <= 3:
                    writeLog('Proxies:%s need IP ... ' % plan['spider'])
                    try:
                        RCONN.delete(key)
                    except Exception:
                        # Best-effort flush; a stale pool is still usable.
                        pass
                    failure = 0
                    num = 0
                    while failure < 3:
                        try:
                            # NOTE(review): API order credential is hard-coded
                            # in the URL; consider moving it into settings.
                            r = requests.get(
                                'http://dps.kuaidaili.com/api/getdps/?orderid=969999783818434&num=200&sep=2',
                                timeout=10)
                            if r.status_code == 200:
                                # BUG FIX: r.content is bytes under Python 3,
                                # so content.split('\n') raises; r.text is a
                                # str on both Python 2 and 3.
                                txts = r.text.split('\n')
                                for txt in txts:
                                    js = {'http': 'http://%s' % txt,
                                          'https': 'http://%s' % txt}
                                    RCONN.lpush(key, str(js))
                                    num += 1
                                break
                            else:
                                # BUG FIX: the original neither broke nor
                                # counted a failure on non-200 responses, so
                                # the loop hammered the API forever.
                                failure += 1
                        except Exception:
                            failure += 1
                    writeLog('Successful Proxies:%s (%s)' % (plan['spider'], num))
        writeLog('proxies_dog sleeping...')
        time.sleep(60)
def job_gdstc():
    """Collect article seeds from gdstc.gd.gov.cn notice listings.

    Walks 20 paginated listing pages, extracts article links from each, and
    pushes every link not yet seen (per the global FILTER) onto the crawl
    queue via push_seed().
    """
    # Listing-page URLs: page 1 has no numeric suffix.
    list_urls = []
    for page in range(1, 21):
        if page == 1:
            list_urls.append("http://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html")
        else:
            list_urls.append("http://gdstc.gd.gov.cn/zwgk_n/tzgg/index_%s.html" % str(page))

    # Headers are loop-invariant; build them once instead of per page.
    headers = {
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Upgrade-Insecure-Requests': "1",
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'Referer': "http://gdstc.gd.gov.cn/zwgk_n/",
        'Cookie': "zh_choose=s; zh_choose=s; openstack_cookie_insert=76667651",
        'Connection': "keep-alive"
    }

    for url in list_urls:
        t1 = time.time()
        response = requests.request("GET", url, headers=headers)
        selector = Selector(text=response.text)
        # BUG FIX: the original rebound the name `urls` (the very list being
        # iterated) to the per-page detail links; renamed to avoid shadowing.
        detail_urls = selector.xpath(
            "/html/body/div[2]/div[2]/div[2]/ul/li/a/@href").extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)          # mark as crawled
                seed = Seed(url=u, downloader='gdstc.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of gdstc (Used: %s)' % (time.time() - t1))
def job_bbsp2peye0():
    """Push the fixed set of p2peye listing seeds for Spider_bbsp2peye.

    Reads the crawl time window via readTime() and enqueues one seed per
    listing section; failures are logged and do not stop the other seeds.
    """
    start, end = readTime(spiderName='bbsp2peye0')
    t1 = time.time()
    sections = {
        'http://news.p2peye.com/ptdt/': '平台动态',
        'http://news.p2peye.com/wdzl/': '网贷专栏',
        'http://news.p2peye.com/wdxw/': '网贷新闻',
        'http://www.p2peye.com/forum-60-1.html': '曝光台',
    }
    for seed_url, category in sections.items():
        try:
            push_seed({
                'url': seed_url,
                'spider': 'Spider_bbsp2peye',
                'category': category,
                'start': start,
                'end': end,
            })
        except Exception as e:
            writeLog(str(e))
    writeLog('Finish add the seeds of bbsp2peye0 (Used: %s)' % (time.time() - t1))
def tasks_dog():
    """Scheduler loop: once per minute, decide which plan jobs are due and run them.

    Plan timing fields come in seven kinds: weekday, day, daystep, hour,
    hourstep, minute, minutestep.
    weekday/day/hour/minute accept an int, a list, or a tuple: hour=3 runs
    daily at 3 o'clock; hour=[3, 4, 5] runs at 3, 4 and 5; hour=(3, 6) runs
    every hour from 3 through 6.
    daystep/hourstep/minutestep are ints (hourstep=3 means every 3 hours).

    NOTE(review): the step arithmetic below compares `difference / step`
    against the integer counter plan['times'], which relies on '/' being
    integer division (Python 2). Under Python 3 these become floats —
    confirm the target interpreter before migrating.
    """
    start = datetime.datetime.now()
    while True:
        now = datetime.datetime.now()
        weekday = now.weekday() + 1  # 1..7 (Monday=1)
        day = now.day
        hour = now.hour
        minute = now.minute
        tasks = []
        for plan in settings.PLANS:
            if 'times' not in plan.keys():
                plan['times'] = 0  # how many times this plan has fired
            if 'weekday' in plan.keys():
                # Each check below skips (continue) when the current time
                # does not satisfy the plan's constraint.
                if isContinue(plan['weekday'], weekday) == 1:
                    continue
            if 'day' in plan.keys():
                if isContinue(plan['day'], day) == 1:
                    continue
            if 'daystep' in plan.keys():
                if isinstance(plan['daystep'], int):
                    if plan['daystep'] == 0 or (now - start).days % plan['daystep'] != 0:
                        continue
            # A day-level plan with no hour constraint defaults to firing at
            # the hour the scheduler was started.
            if (
                    'weekday' in plan.keys() or 'day' in plan.keys() or 'daystep' in plan.keys()) and 'hour' not in plan.keys() and 'hourstep' not in plan.keys():
                plan['hour'] = start.hour
            if 'hour' in plan.keys():
                if isContinue(plan['hour'], hour) == 1:
                    continue
            if 'hourstep' in plan.keys():
                if isinstance(plan['hourstep'], int):
                    # NOTE(review): .seconds ignores whole days of uptime —
                    # confirm hourstep plans are meant to reset daily.
                    difference = (now - start).seconds / 3600
                    if plan['hourstep'] == 0 or difference % plan['hourstep'] != 0 or difference / plan['hourstep'] != \
                            plan['times']:
                        continue
            # An hour-level plan with no minute constraint defaults to firing
            # at the minute the scheduler was started.
            if (
                    'hour' in plan.keys() or 'hourstep' in plan.keys()) and 'minute' not in plan.keys() and 'minutestep' not in plan.keys():
                plan['minute'] = start.minute
            if 'minute' in plan.keys():
                if isContinue(plan['minute'], minute) == 1:
                    continue
            if 'minutestep' in plan.keys():
                if isinstance(plan['minutestep'], int):
                    difference = int((now - start).total_seconds()) / 60  # minutes elapsed since start
                    # Catch up the counter if whole intervals were missed.
                    if difference / plan['minutestep'] > plan['times']:
                        plan['times'] = difference / plan['minutestep']
                    if plan['minutestep'] == 0 or difference % plan['minutestep'] != 0 or difference / plan[
                            'minutestep'] != plan['times']:
                        continue
            plan['times'] += 1
            tasks.append(getattr(jobs, plan['name']))
        writeLog("Total task to run : {}".format(len(tasks)))
        for task in tasks:
            try:
                task()
            except Exception as e:
                """执行任务异常"""
                # Task raised: log the full traceback and keep the loop alive.
                msg = traceback.format_exc()  # capture full traceback text
                writeLog("run task exception: %s" % msg)
        # Sleep until the top of the next minute.
        sleep_time = 60 - datetime.datetime.now().second
        writeLog('tasks_dog sleeping...{}'.format(sleep_time))
        time.sleep(sleep_time)