def test():
    """Smoke-test helper: print the current timestamp and a marker line."""
    # NOTE(review): this uses datetime.datetime.now() while entrance() below
    # uses datetime.now() -- only one can match this module's import style;
    # confirm against the file's (unseen) import block.
    print('当前时间:%s' % datetime.datetime.now().strftime("%Y-%m-%d %X"))
    print('8888888888888888888888')


# Task entry point
def entrance():
    """Run crawl rounds until the daily cutoff time is reached.

    Each round re-enqueues every configured URL into the shared queue ``q``,
    starts the worker threads via ``open_thread()``, then either stops
    (past ``end_time``, or no URLs are configured) or sleeps 30 minutes
    before the next round.
    """
    # Initialize the task queue from the 'urls' config section.
    global options, q
    options = configutil.getoptions('urls')
    end_time = '10:00:00'
    round_no = 0
    while True:
        round_no += 1
        for opt in options:
            q.put(opt)
        # Lazy %-args: let the logging framework do the formatting.
        logger.info('-----------------第%s次爬取开始,当前队列中还有%s个网站需要处理-----------------',
                    round_no, q.qsize())
        open_thread()
        now_time = datetime.now().strftime('%X')
        # String comparison is valid here: %X yields zero-padded HH:MM:SS.
        if end_time < now_time or not options:
            logger.info('================================今日任务已完成,爬虫进入休眠状态================================')
            break
        # Not past cutoff yet: wait 30 minutes before the next round.
        time.sleep(1800)


if __name__ == '__main__':
    logger.info('================================日报爬虫开始工作================================')
    entrance()
var n = Math.floor(Math.random() * 16.0).toString(16); guid += n; // if ((i == 8) || (i == 12) || (i == 16) || (i == 20)) guid += // "-"; } return guid; }""" ctx = execjs.compile(js) pageid = ctx.call("happy") return pageid if __name__ == '__main__': servers = ['47.111.24.165:5000', '47.94.209.31:5000', '47.105.61.16:5000'] ser = random.choice(servers) logger.info('本次工作ip : %s' % ser) logger.info('=========================开始抓取政府网站案件=========================') logger.info('~~~~~~~~~~~~~~~~~~~证券部分~~~~~~~~~~~~~~~') zq = getconfig('pjws', 'address1').format(get_pageid()) result = grab(zq, ser) for i in result: logger.info('%s、%s' % (result.index(i) + 1, i)) es_operate(result) time.sleep(random.randint(100, 300)) logger.info( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) logger.info( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) logger.info(
def detail_page(self, docid):
    """Fetch and decrypt the detail page of one judgment document.

    Posts the doc id (with a fresh ciphertext and request-verification
    token) to the wenshu rest endpoint, then decrypts the returned
    ``result`` with the response's ``secretKey`` and today's date.

    Args:
        docid: Document id obtained from a prior doc-id listing request.
    """
    url = "http://wenshu.court.gov.cn/website/parse/rest.q4w"
    data = {
        "docId": "%s" % docid,
        "ciphertext": get_cipher(),
        "cfg": "com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch",
        "__RequestVerificationToken": "%s" % get_token(),
    }
    response = self.session.post(url, data=data, headers=self.headers)
    json_value = json.loads(response.text)
    secretKey = json_value["secretKey"]
    result = json_value["result"]
    # `data` is rebound here: request payload above, decrypted payload below.
    data = json.loads(
        get_result(result, secretKey, time.strftime("%Y%m%d")))
    print(data)


if __name__ == '__main__':
    demo = wenshu()
    logger.info('==================开始抓取证券部分==================')
    # NOTE(review): '%E9%93%B6%E8%A1%8C' URL-decodes to 银行 (bank), but the
    # log line above says 证券 (securities) and 银行 is fetched again below --
    # confirm whether this first call should pass the securities keyword.
    demo.get_docid('%E9%93%B6%E8%A1%8C')
    time.sleep(random.randint(30, 120))
    logger.info('++++++++++++++++++开始抓取银行部分++++++++++++++++++')
    demo.get_docid('银行')
    time.sleep(random.randint(30, 120))
    logger.info('~~~~~~~~~~~~~~~~~~开始抓取信托部分~~~~~~~~~~~~~~~~~~')
    demo.get_docid('信托')
import hashlib
import time

from elasticsearch import Elasticsearch

from util import configutil
from util.LoggerClass import Logger

# Module-level logger writing to the 'newspaper' log under the 'esutil' tag.
logger = Logger(logname='newspaper', logger='esutil').getlog()

# Build a shared Elasticsearch client from the 'eshost' config section.
# NOTE(review): a failure here is only logged, leaving `es` undefined, so any
# later call in this module would raise NameError -- confirm this is intended.
try:
    host = configutil.getconfig('eshost', 'host')
    port = configutil.getconfig('eshost', 'port')
    es = Elasticsearch([{'host': host, 'port': port}])
except Exception as ex:
    logger.info(ex)


def insert_single_data(index_name, doc_type, data, esid):
    """Index one document under the given id.

    Returns the ES response dict, or None if indexing raised (the error is
    logged, not propagated).
    """
    try:
        res = es.index(index=index_name, doc_type=doc_type, body=data, id=esid)
        return res
    except Exception as e:
        logger.info(e)


def insert_datas(index_name, doc_type, datas):
    """Bulk-index documents via the ES bulk API; returns the bulk response."""
    try:
        res = es.bulk(index=index_name, doc_type=doc_type, body=datas)
        return res
    except Exception as e:
        # NOTE(review): the except-handler body is truncated in this view.