def cookieurl_3():
    """Scrape article links from the configured 'cookieurl' #3 page into ES.

    Fetches the page via ``cookie_Parse``, collects every ``<a>`` inside the
    ``td.2016_erji_content`` container, resolves relative hrefs against the
    page URL, and indexes each link not already present (deduplicated by the
    MD5 of the resolved URL). Any failure is logged and swallowed.
    """
    url = configutil.getconfig('cookieurl', '3')
    page = cookie_Parse(url)
    soup = BeautifulSoup(page, 'html.parser')
    name = soup.title.text  # page title doubles as the source name
    try:
        tbody = soup.find('td', attrs={'class': '2016_erji_content'})
        for link in tbody.find_all('a'):
            real_path = parse.urljoin(url, link.get('href'))  # resolve relative hrefs
            urlMd5 = esutil.format_md5(real_path)
            # Guard clause replaces the original `if ...: pass else: ...` —
            # skip links that are already indexed.
            if esutil.query_data('spidernews_index', 'spidernews_type', urlMd5):
                continue
            data = {
                'link': real_path,
                'name': name,
                'createTime': int(round(time.time() * 1000)),  # epoch millis
                'title': link.get_text(),
                'urlMd5': urlMd5,
            }
            # Was a bare debug print(); use the module logger like cookieurl_1 does.
            logger.info(data)
            esutil.insert_single_data('spidernews_index', 'spidernews_type', data, urlMd5)
    except Exception as e:
        logger.info(e)
def cookieurl_1():
    """Scrape article links from the configured 'cookieurl' #1 page into ES.

    Fetches the page via ``cookie_Parse``, walks every ``<a>`` inside the
    ``tbody#contentBody`` element, and indexes each link not already present
    (deduplicated by the MD5 of the raw href — note hrefs are NOT resolved
    against the page URL here, unlike cookieurl_3). Failures are logged and
    swallowed.
    """
    url = configutil.getconfig('cookieurl', '1')
    page = cookie_Parse(url)
    soup = BeautifulSoup(page, 'html.parser')
    name = soup.title.text  # page title doubles as the source name
    try:
        tbody = soup.find('tbody', attrs={'id': 'contentBody'})
        for link in tbody.find_all('a'):
            href = link.get('href')
            urlMd5 = esutil.format_md5(href)
            # Guard clause replaces the original `if ...: pass else: ...` —
            # skip links that are already indexed.
            if esutil.query_data('spidernews_index', 'spidernews_type', urlMd5):
                continue
            data = {
                'link': href,
                'name': name,
                'createTime': int(round(time.time() * 1000)),  # epoch millis
                'title': link.get_text(),
                'urlMd5': urlMd5,
            }
            esutil.insert_single_data('spidernews_index', 'spidernews_type', data, urlMd5)
    except Exception as e:
        logger.info(e)
def confirm(string):
    """Return True when the company-check service reports code '2' for *string*.

    Builds the request URL by appending *string* to the configured
    'companycheck' address. Any network or parse failure is logged and
    treated as a negative result (False).
    """
    try:
        url = getconfig('companycheck', 'address') + string
        resp = requests.get(url)
        result = json.loads(resp.text)
        # The service signals a confirmed match with the string code '2';
        # a direct comparison replaces the original if/return-True/else/return-False.
        return result.get('code') == '2'
    except Exception as e:
        logger.info(e)
        return False
def formatUrl():
    """Return today's 'nfrb' URL: the configured template filled with the current date ('%Y-%m/%d')."""
    template = getconfig('urls', 'nfrb')
    today = time.strftime('%Y-%m/%d', time.localtime(time.time()))
    return template.format(today)
def formatUrl():
    """Return today's 'hbrb' URL with the date ('%Y%m%d') filled in.

    The second format argument re-inserts a literal '{}' so the result keeps
    one placeholder for a later substitution.
    """
    template = getconfig('urls', 'hbrb')
    stamp = time.strftime('%Y%m%d', time.localtime(time.time()))
    return template.format(stamp, '{}')
def formatUrl():
    """Return today's 'jxrb' URL, filling two date slots ('%Y-%m/%d' and '%Y-%m-%d')."""
    now = time.localtime(time.time())
    slash_date = time.strftime('%Y-%m/%d', now)
    dash_date = time.strftime('%Y-%m-%d', now)
    return getconfig('urls', 'jxrb').format(slash_date, dash_date)
// "-"; } return guid; }""" ctx = execjs.compile(js) pageid = ctx.call("happy") return pageid if __name__ == '__main__': servers = ['47.111.24.165:5000', '47.94.209.31:5000', '47.105.61.16:5000'] ser = random.choice(servers) logger.info('本次工作ip : %s' % ser) logger.info('=========================开始抓取政府网站案件=========================') logger.info('~~~~~~~~~~~~~~~~~~~证券部分~~~~~~~~~~~~~~~') zq = getconfig('pjws', 'address1').format(get_pageid()) result = grab(zq, ser) for i in result: logger.info('%s、%s' % (result.index(i) + 1, i)) es_operate(result) time.sleep(random.randint(100, 300)) logger.info( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) logger.info( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) logger.info( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) logger.info(
def formatUrl():
    """Return today's 'dzszb' URL: the configured template filled with the current date ('%Y%m%d')."""
    stamp = time.strftime('%Y%m%d', time.localtime(time.time()))
    return getconfig('urls', 'dzszb').format(stamp)
# -*- coding: utf-8 -*- import hashlib import time from elasticsearch import Elasticsearch from util import configutil from util.LoggerClass import Logger logger = Logger(logname='newspaper', logger='esutil').getlog() try: host = configutil.getconfig('eshost', 'host') port = configutil.getconfig('eshost', 'port') es = Elasticsearch([{'host': host, 'port': port}]) except Exception as ex: logger.info(ex) def insert_single_data(index_name, doc_type, data, esid): try: res = es.index(index=index_name, doc_type=doc_type, body=data, id=esid) return res except Exception as e: logger.info(e) def insert_datas(index_name, doc_type, datas): try: res = es.bulk(index=index_name, doc_type=doc_type, body=datas) return res