def fix_javascript(url, content):
    """Work around anti-crawler JavaScript challenges on two exchange sites.

    (Original Chinese note: quirks of the Zhongnan culture exchange site;
    assumes node is at least installed, since execjs needs a JS runtime.)

    Args:
        url: the page URL that produced *content*.
        content: raw response bytes, possibly a JS challenge page.

    Returns:
        The page content as bytes — re-fetched past the challenge when one
        was detected, otherwise the original *content* unchanged.  Any
        failure is logged and the original bytes are returned (best-effort).
    """
    try:
        if 'znypjy' in url:
            # Challenge page embeds a decoder function; run it with node via
            # execjs and re-request the URL with the resulting token.
            import execjs  # third-party, only needed for this branch
            text = content.decode('gb18030', 'ignore')
            # raw string: \w etc. must not be interpreted as escapes
            m = re.search(r'(function.*?;})window.location', text)
            if m:
                script = m.group(1)
                code = execjs.compile(script).call('decoder')
                content = session.get(url + '?' + code,
                                      timeout=(5, 10)).content
        elif 'xhcae' in url:
            # WebShield interstitial: extract the verified path from the
            # page and fetch it directly.
            text = content.decode('gb18030', 'ignore')
            m = re.search(r'/notice/\w+/\?WebShieldSessionVerify=\w+', text)
            if m:
                url = m.group(0)
                content = session.get('http://www.xhcae.com' + url,
                                      timeout=(5, 10)).content
    except Exception:
        # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
        # still propagate; everything else is logged and swallowed.
        log.exception('')
    return content
# NOTE(review): this is a byte-identical duplicate of the fix_javascript
# definition earlier in the file; this second definition shadows the first.
# One of the two should be deleted once callers are confirmed.
def fix_javascript(url, content):
    """Work around anti-crawler JavaScript challenges on two exchange sites.

    (Original Chinese note: quirks of the Zhongnan culture exchange site;
    assumes node is at least installed, since execjs needs a JS runtime.)

    Args:
        url: the page URL that produced *content*.
        content: raw response bytes, possibly a JS challenge page.

    Returns:
        The page content as bytes — re-fetched past the challenge when one
        was detected, otherwise the original *content* unchanged.  Any
        failure is logged and the original bytes are returned (best-effort).
    """
    try:
        if 'znypjy' in url:
            # Challenge page embeds a decoder function; run it with node via
            # execjs and re-request the URL with the resulting token.
            import execjs  # third-party, only needed for this branch
            text = content.decode('gb18030', 'ignore')
            # raw string: \w etc. must not be interpreted as escapes
            m = re.search(r'(function.*?;})window.location', text)
            if m:
                script = m.group(1)
                code = execjs.compile(script).call('decoder')
                content = session.get(url + '?' + code,
                                      timeout=(5, 10)).content
        elif 'xhcae' in url:
            # WebShield interstitial: extract the verified path from the
            # page and fetch it directly.
            text = content.decode('gb18030', 'ignore')
            m = re.search(r'/notice/\w+/\?WebShieldSessionVerify=\w+', text)
            if m:
                url = m.group(0)
                content = session.get('http://www.xhcae.com' + url,
                                      timeout=(5, 10)).content
    except Exception:
        # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
        # still propagate; everything else is logged and swallowed.
        log.exception('')
    return content
def crawl_all():
    """Crawl every site in SITES, retrying each up to 5 times.

    A failed attempt is logged together with the number of retries left and
    the site is tried again; after 5 consecutive failures the site is
    skipped and crawling moves on to the next one.
    """
    for site in SITES:
        retries = 5
        while retries > 0:
            retries -= 1
            try:
                crawl(site, maxpage=1)
            except Exception:
                # Narrowed from a bare except: keeps the loop interruptible
                # (Ctrl-C / SystemExit propagate) while staying best-effort.
                log.exception('站点{}爬取失败, retries={}'.format(site, retries))
            else:
                # Success: stop retrying this site.
                break
# NOTE(review): this is a duplicate of the crawl_all definition earlier in
# the file; this second definition shadows the first. One of the two should
# be deleted once callers are confirmed.
def crawl_all():
    """Crawl every site in SITES, retrying each up to 5 times.

    A failed attempt is logged together with the number of retries left and
    the site is tried again; after 5 consecutive failures the site is
    skipped and crawling moves on to the next one.
    """
    for site in SITES:
        retries = 5
        while retries > 0:
            retries -= 1
            try:
                crawl(site, maxpage=1)
            except Exception:
                # Narrowed from a bare except: keeps the loop interruptible
                # (Ctrl-C / SystemExit propagate) while staying best-effort.
                log.exception('站点{}爬取失败, retries={}'.format(site, retries))
            else:
                # Success: stop retrying this site.
                break
def __exit__(self, type, value, traceback):
    # Context-manager exit hook: *value* is the in-flight exception instance,
    # or None when the with-block completed normally.
    # NOTE(review): parameter `type` shadows the builtin; conventional names
    # are (exc_type, exc_value, traceback) — left unchanged here.
    if value:
        # An exception escaped the with-block: log it with traceback.
        crawl_log.exception('出错啦')
        # Remove the file at `path` — presumably a partial/corrupt artifact.
        # NOTE(review): `path` is not defined in this block; presumably a
        # module-level or closure Path object — confirm against the rest of
        # the file.
        path.unlink()
        # Returning True suppresses the exception so the caller continues.
        # NOTE(review): original indentation was lost in this source; the
        # `return True` may instead have been at function level (always
        # suppressing) — confirm against version control.
        return True