def pre_save(self, saver):
    saver.add(GkChsiParser.title)
    s2 = LinkSaver('res_score_%s' % self.name, 'w')
    # CSV header: province/city, subject category, level, rank, score
    s2.add('省市,科类,层次,位次,分数')
    for r in self.score_rank:
        s2.add(r)
    s2.flush()
class ShanghaiStoreFilter(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'shanghai_court')
        self.pagestore = ShanghaiCourtStore('sh_court_2')
        self.link_saver = LinkSaver('wrong.id.txt')

    def process_child_item(self, item):
        self.pagestore.save(int(item['crawlerUpdateTime'] / 1000), item['indexUrl'][17:], item['realUrl'],
                            item['content'][1])

    def parse_item(self, page):
        if page['indexUrl'][17] != '/':
            return [page]
        self.link_saver.add(page['indexUrl'][17:])
        return []

    def on_finish(self):
        self.link_saver.flush()

    def on_save(self, items):
        for item in items:
            self.pagestore.save(int(item['crawlerUpdateTime'] / 1000), item['indexUrl'][17:], item['realUrl'],
                                item['content'][1])
class FileAbstractParser(CAPParser):
    def __init__(self, channel, name, saver_name=None, db='admin', url='mongodb://*****:*****@localhost/'):
        CAPParser.__init__(self, channel, name, db, url)
        if saver_name is None:
            self._save_name = 'out.csv'
        else:
            self._save_name = saver_name
        self.saver = None

    def init(self):
        self.saver = LinkSaver(self._save_name, 'w')
        self.pre_save(self.saver)
        return CAPParser.init(self)

    def parse(self, page):
        pass

    def pre_save(self, saver):
        pass

    def save(self, saver, page):
        pass

    def on_save(self, items):
        item_list = spider.util.unique_list(items)
        for item in item_list:
            self.save(self.saver, item)
        self.saver.flush()
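# FileAbstractParser leaves parse/pre_save/save as empty hooks for subclasses to
# fill in. A minimal, hypothetical subclass sketch (the field names below are
# assumptions for illustration, not part of the real parsers):
#
#     class ExampleCsvParser(FileAbstractParser):
#         def pre_save(self, saver):
#             # written once by init(), before any item is saved
#             saver.add('id,url')
#
#         def save(self, saver, page):
#             # one CSV row per item passed to on_save()
#             saver.add('%s,%s' % (page['indexUrl'], page['realUrl']))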
class SeedParser(WenshuSpider):
    date_format = '%Y%m%d'

    def __init__(self, thcnt=4, page=15):
        WenshuSpider.__init__(self, thcnt)
        self.source = WenshuSeedDb('ws_seed')
        self.link_saver = LinkSaver('seeds.dat', buffer_size=400)
        self.page = page

    def dispatch(self):
        seeds = self.source.export_seeds()
        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            date = seed['indexUrl'].split('://')[1]
            eval_str = seed['content'][1:-1].replace('\\"', '"')
            res = eval(eval_str)
            try:
                if (isinstance(res, tuple) or isinstance(res, list)) and len(res) > 0:
                    self.add_main_job({'type': 'main', 'date': date.encode('utf-8'), 'count': int(res[0]['Count'])})
                else:
                    print 'invalid seed', seed
            except KeyError as e:
                Log.error('KeyError %s' % e.message)
                traceback.print_exc()
                print seed
                print eval_str
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def run_job(self, jobid):
        pagecnt = (jobid['count'] + self.page / 2) / self.page
        for index in range(1, pagecnt + 1):
            self.link_saver.add(
                str({'date': jobid['date'], 'count': jobid['count'], 'index': index, 'page': self.page}))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.link_saver.flush()
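# A worked example of SeedParser.run_job's paging arithmetic (Python 2 integer
# division, default page=15): pagecnt = (count + 7) / 15 rounds to the nearest
# page rather than up, so count=100 gives 7 pages while count=7 gives 0 pages
# (no index entries are written for such small counts).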
def on_finish(self):
    FileAbstractParser.on_finish(self)
    unfetch_saver = LinkSaver('unfetched_seeds_detail_' + self.channel)
    self.unfetch_list = spider.util.unique_list(self.unfetch_list)
    self.fetched_list = spider.util.unique_list(self.fetched_list)
    unfetched = []
    for link in self.unfetch_list:
        if link not in self.fetched_list:
            unfetched.append(link)
    self.unfetch_list = unfetched
    for link in self.unfetch_list:
        unfetch_saver.add(link)
    unfetch_saver.flush()
    fetchsaver = LinkSaver('fetched_seeds_detail_' + self.channel)
    for l in self.fetched_list:
        fetchsaver.add(str(l))
    fetchsaver.flush()
    print 'fetched jobs', len(self.fetched_list)
    print 'unfetched jobs', len(self.unfetch_list)
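# Note on the on_finish() filtering above: the `link not in self.fetched_list`
# test rescans the whole fetched list for every unfetched link. A hedged
# alternative sketch (assuming the links are hashable strings) with the same
# result but O(1) membership checks:
#
#     fetched = set(self.fetched_list)
#     self.unfetch_list = [link for link in self.unfetch_list if link not in fetched]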
class ShanghaiExtractor(CWPParser):
    """Extract case numbers (案号) from judgment documents."""

    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'court')
        self.an_saver = LinkSaver('ah.%s.txt' % self.name)

    def process_child_item(self, item):
        line = '%s|%s' % (item[0], item[1])
        print line
        self.an_saver.add(line)

    def init(self):
        print 'job start at', datetime.datetime.now()
        return CWPParser.init(self)

    def parse_item(self, page):
        m = re.search('((\d{4}).*\d+号)', page['content'][1])
        if m:
            return [[m.group(1), page['indexUrl'][17:].encode()]]
        return []

    def on_finish(self):
        self.an_saver.flush()
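# A minimal usage sketch for the extractor above (hedged: it assumes CWPParser
# exposes a run() entry point like the other parsers in this repo; adjust to
# however they are actually launched):
#
#     if __name__ == '__main__':
#         ShanghaiExtractor().run()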
class ChsiSpider(BaseGkChsiFsxSpider):
    def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, sleep_max=5,
                 ua='firefox', seeds='detail_seeds', recover=False, year='15', bkccs=None, kldms=None, job_tag='',
                 spider_type='detail', post_kldms=True):
        super(ChsiSpider, self).__init__(threadcnt, account, '%s%s' % (tag, job_tag), proxy, sleep, captcha_limit,
                                         sleep_max, ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = self.new_page_store(spider_type, tag)
        self.full_tag = tag
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.info_saver = LinkSaver('info_data_%s_%s%s' % (spider_type, tag, job_tag))
        self.failed_saver = LinkSaver('%s.failed.seeds.%s%s' % (spider_type, tag, job_tag))
        self.invalid_saver = LinkSaver('%s.invalid.seeds.%s%s' % (spider_type, tag, job_tag))
        self.year = year
        self.failed_list = []
        self.invalid_list = []
        self.spider_type = spider_type
        self.post_kldms = post_kldms

    def dispatch(self):
        # read all seeds
        seeds = []
        with open(self.seeds, 'r') as f:
            for l in f:
                data = self.parse_seed(l.strip())
                if not data:
                    continue
                if self.year == str(data['years']):
                    if not self.recover or not self.pagestore.find_any(
                            self.pagestore.channel + '://' + self.get_job_id(data)):
                        seeds.append(data)
        print 'load ', len(seeds), 'jobs'
        count = 10
        while len(seeds) > 0 and count > 0:
            count -= 1
            logging.info('remain tries %d', count)
            for kldm in self.kldms:
                for bkcc in self.bkccs:
                    seeds = self.request_list(seeds, kldm, bkcc)
                    logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s,tries=%d', len(seeds), len(self.failed_list),
                                 kldm, bkcc, count)
                    time.sleep(2)
                    self.wait_q()
            seeds += self.failed_list
            self.failed_list = []
        self.wait_q()
        self.add_job(None)
        self.failed_list = seeds

    def handle_job(self, jobid):
        pass

    def re_add_failed_job(self, jobid):
        if jobid.has_key('content'):
            jobid.pop('content')
        if jobid.has_key('url'):
            jobid.pop('url')
        cnt = jobid.get('_failed_cnt_', 0) + 1
        jobid['_failed_cnt_'] = cnt
        self.failed_list.append(jobid)

    def save_invalid_job(self, jobid):
        cnt = jobid.get('_invalid_cnt_', 0) + 1
        jobid['_invalid_cnt_'] = cnt
        if cnt < 2:
            self.re_add_failed_job(jobid)
        else:
            if jobid.has_key('content'):
                jobid.pop('content')
            if jobid.has_key('url'):
                jobid.pop('url')
            self.invalid_list.append(jobid)

    def request_list(self, seeds, kldm, bkcc):
        remains = []
        if self.post_kldms:
            self.post_kldm_bkcc_for_session(kldm, bkcc)
            for seed in seeds:
                if seed['kldm'] == kldm and bkcc == seed['bkcc']:
                    self.add_main_job(seed)
                else:
                    remains.append(seed)
        else:
            for seed in seeds:
                self.add_main_job(seed)
        return remains

    def run_job(self, jobid):
        if self.pre_job(jobid):
            return
        if not jobid.has_key('content'):
            self.re_add_failed_job(jobid)
            return
        detail_content = jobid['content']
        if detail_content is None:
            self.re_add_failed_job(jobid)
            return
        try:
            if self._check_result(detail_content.text, jobid, jobid['url']):
                # exception was found and handled
                return
        except InvalidQueryError as e:
            logging.info(e.message)
            self.save_invalid_job(jobid)
            return
        except Exception as e:
            logging.info(e.message)
            self.re_add_failed_job(jobid)
            return
        if not jobid.has_key('url'):
            print jobid
            self.re_add_failed_job(jobid)
            return
        jid = self.get_job_id(jobid)
        print 'saving %s==>%s' % (jid, len(detail_content.text))
        self.pagestore.save(int(time.time()), jid, jobid['url'], detail_content.text)

    def get_job_title(self, jobid):
        raise NotImplementedError('Virtual method called')

    def new_page_store(self, spider, tag):
        raise NotImplementedError('Virtual method called')

    def get_job_id(self, jobid):
        raise NotImplementedError('Virtual method called')

    def parse_page(self, jobid, content):
        raise NotImplementedError('Virtual method called')

    def get_url(self, jobid):
        raise NotImplementedError('Virtual method called')

    def report_job(self, jobid):
        raise NotImplementedError('Virtual method called')

    def add_job(self, jobid, mainjob=False):
        if jobid is None:
            super(ChsiSpider, self).add_job(jobid, mainjob)
            return
        url = self.get_url(jobid)
        count = 3
        content = None
        while count > 0 and not content:
            content = self.request_content(jobid, url)
            count -= 1
        if content is None:
            self.re_add_failed_job(jobid)
            return
        jobid['content'] = content
        jobid['url'] = url
        self.report_job(jobid)
        super(ChsiSpider, self).add_job(jobid, mainjob)
        self.parse_page(jobid, content)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'seeds: %s\n' % self.seeds
            msg += "saved: %d\n" % self.pagestore.saved_count
            msg += 'captcha times: %s\n' % self._captcha_times
            msg += 'remain seeds: %d\n' % len(self.failed_list)
            msg += 'invalid seeds: %d\n' % len(self.invalid_list)
            for item in self.except_state:
                msg += '%s: %d\n' % (item.name(), item.count())
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
            print 'remain seeds', len(self.failed_list)
            print 'invalid seeds', len(self.invalid_list)
            for seed in self.invalid_list:
                self.invalid_saver.add(str(seed))
            self.invalid_saver.flush()
            for seed in self.failed_list:
                self.failed_saver.add(str(seed))
            self.failed_saver.flush()
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass

    def parse_seed(self, param):
        raise NotImplementedError('Virtual method called')

    def request_content(self, jobid, url):
        raise NotImplementedError('Virtual method called')
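# ChsiSpider is an abstract base: new_page_store, get_job_id, get_url, parse_seed,
# request_content, parse_page, report_job and get_job_title must be provided by a
# concrete spider. A hypothetical skeleton (class name and return values below are
# illustrative only):
#
#     class ExampleChsiDetailSpider(ChsiSpider):
#         def new_page_store(self, spider_type, tag):
#             return GkChsiDetailPaperStore('yggk_detail_' + tag)
#
#         def get_job_id(self, jobid):
#             return '%s/%s/%s' % (jobid['yxdm'], jobid['years'], jobid['zydm'])
#
#         def get_url(self, jobid):
#             # build the detail URL for this seed
#             return 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?...'
#
#         # ... plus parse_seed, request_content, parse_page, report_job, get_job_title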
class BjListSpider(BJSpider):
    def __init__(self, threadcnt, last_page=None, total_page=22305, save_file='seeds.dat', sleep=0.0, proxy_life=180):
        super(BjListSpider, self).__init__(threadcnt, 'BjListSpider', proxy_life=proxy_life)
        self.test_mode = False
        self.sleep = sleep
        self.zero_link_count = 0
        self.lock = threading.Lock()
        self._shutdown = False
        self.result_saver = LinkSaver(save_file, 'a')
        self.captcha = FoodMakerExtendLock(threadcnt - 1)
        self.last_page = last_page
        self.total_page = total_page

    def dispatch(self):
        if self.last_page is not None and self.last_page <= self.total_page:
            for page in range(self.last_page, self.total_page + 1):
                self.add_main_job({'type': 'list', 'url': 'http://www.bjcourt.gov.cn/cpws/index.htm?page=%s' % page})
        else:
            self.add_main_job({'type': 'main', 'url': 'http://www.bjcourt.gov.cn/cpws/index.htm'})
        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def with_sleep_request_url(self, url, **kwargs):
        time.sleep(self.sleep)
        return self.request_url(url, **kwargs)

    def _dec_worker(self):
        self.captcha.decrease()
        super(BjListSpider, self)._dec_worker()

    def run_job(self, jobid):
        if not isinstance(jobid, dict):
            return
        if self._shutdown:
            return
        jt = jobid['type']
        url = jobid['url']
        time.sleep(2)
        con = self.with_sleep_request_url(url, timeout=10)
        if self.check_exception(con, jobid):
            return
        m = re.search('yzmInput', con.text)
        if m:
            print self.get_tid(), url, ' need captcha'
            con = self.resolve_captcha(url)
            if self.check_exception(con, jobid):
                return
            if re.search(r'yzmInput', con.text):
                self._shutdown = True
                self.link_saver.add('%d,%d,%s' % (2, 0, url))
                return
        if 'main' == jt:
            m = re.search(ur'您搜到了\s*<em>([0-9]+)</em>\s*条符合条件的文书', con.text, re.S)
            if not m:
                if re.search(r'yzmInput', con.text):
                    self._shutdown = True
                    self.link_saver.add('%d,%d,%s' % (2, 0, url))
                return
            papercnt = int(m.group(1))
            if papercnt <= 0:
                print 'no documents found on', url
                with self.lock:
                    self.zero_link_count += 1
                return
            print 'there are %d papers on %s' % (papercnt, url)
            self.link_saver.add('%d,%d,%s' % (1, papercnt, url))
            n_url = url
            if n_url.find('?') < 0:
                n_url += '?'
            elif n_url[-1] != '&':
                n_url += '&'
            for page in range((papercnt + 10) / 20 + 1, 1, -1):
                self.add_job({'type': 'list', 'url': n_url + 'page=%s' % page})
        ids = re.findall(r'\/cpws\/paperView.htm\?id=(\d+)', con.text)
        if not ids or len(ids) == 0:
            print 'cannot find any paper on', url
            return
        print 'add %d papers from %s' % (len(ids), url)
        for id in ids:
            self.result_saver.add(id)

    def split_url(self, url):
        urls = CData.split_param(url)
        for u in urls:
            self.add_job({'type': 'main', 'url': u})

    def event_handler(self, evt, msg, **kwargs):
        super(BjListSpider, self).event_handler(evt, msg, **kwargs)
        if evt == 'DONE':
            self.result_saver.flush()
            msg += 'zero count: %d\n' % self.zero_link_count
            msg += 'captcha times: %d\n' % self.captcha_times
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
class BaseChsiSpider(BaseGkChsiFsxSpider):
    def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, seeds='detail_seeds',
                 recover=False, sleep_max=5, ua='firefox', year='15', bkccs=None, kldms=None, job_tag=''):
        super(BaseChsiSpider, self).__init__(threadcnt, account, tag, proxy, sleep, captcha_limit, sleep_max, ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + tag)
        self.full_tag = tag
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.parser = HTMLParser.HTMLParser()
        self.info_saver = LinkSaver(tag + '_detail_data')
        self.failed_saver = LinkSaver('detail.failed.seeds.' + tag + job_tag)
        self.year = year
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
        self.failed_list = []
        self.last_request_time = time.time()

    def dispatch(self):
        # read all seeds
        seeds = []
        with open(self.seeds, 'r') as f:
            for l in f:
                if l[0] == '{':
                    data = eval(l.strip())
                else:
                    param = l.strip().split(',')
                    if len(param) < 9:
                        logging.warn('invalid seeds %s', l)
                        continue
                    data = {'wclx': 1, 'yxdm': param[6], 'kldm': param[2], 'bkcc': param[4], 'start': 0,
                            'years': param[5], 'zydm': param[7], 'zymc': param[8].encode('utf-8')}
                if self.year == data['years'] and not self.pagestore.find_any(
                        self.pagestore.channel + '://' + self.get_jobid(data)):
                    seeds.append(data)
        print 'load ', len(seeds), 'jobs'
        count = 10
        while len(seeds) > 0 and count > 0:
            count -= 1
            logging.info('remain tries %d', count)
            for kldm in self.kldms:
                for bkcc in self.bkccs:
                    seeds = self.request_list(seeds, kldm, bkcc)
                    logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s', len(seeds), len(self.failed_list), kldm, bkcc)
            seeds += self.failed_list
            self.failed_list = []
        time.sleep(2)
        self.wait_q()
        self.add_job(None)
        print 'remain seeds', len(seeds)
        for seed in seeds:
            self.failed_saver.add(seed)
        self.failed_saver.flush()
        self.failed_list = seeds

    def handle_job(self, jobid):
        pass

    def request_list(self, seeds, kldm, bkcc):
        self.post_kldm_bkcc_for_session(kldm, bkcc)
        remains = []
        for seed in seeds:
            if seed['kldm'] == kldm and bkcc == seed['bkcc']:
                self.add_main_job(seed)
            else:
                remains.append(seed)
        return remains

    def run_job(self, jobid):
        if not jobid.has_key('content'):
            if jobid not in self.failed_list:
                self.failed_list.append(jobid)
            return
        detail_content = jobid['content']
        jtitle = '%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'], jobid['start'])
        self.pagestore.save(int(time.time()), '%s/%s/%s' % (jtitle, jobid['zydm'], int(jobid['start']) / 10),
                            jobid['url'], detail_content.text)

    def add_job(self, jobid, mainjob=False):
        if jobid is None:
            super(BaseChsiSpider, self).add_job(jobid)
            return
        logging.info('fetching special %s,%s', jobid['zymc'], jobid['zydm'])
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'], jobid['zydm'], jobid['start'])
        content = self.fetch_content(jobid, detail_url)
        if content is None:
            # the failure has already been recorded by fetch_content
            return
        jobid['content'] = content
        jobid['url'] = detail_url
        super(BaseChsiSpider, self).add_job(jobid, True)
        if 0 == jobid['start']:
            m = re.search(ur'共 (\d+) 页', content.text)
            if not m:
                logging.warn('failed to find page count %s,%s,%s', jobid['kldm'], jobid['bkcc'], detail_url)
                return
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                return
            for p in range(1, page_cnt):
                job = copy.deepcopy(jobid)
                job['start'] = p * 10
                self.add_main_job(job)

    def get_jobid(self, jobid):
        return '%s/%s/%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'], jobid['start'],
            jobid['zydm'], int(jobid['start']) / 10)

    def fetch_content(self, jobid, detail_url):
        detail_content = self.request_url(detail_url, allow_redirects=20)
        if detail_content is None:
            self.failed_list.append(jobid)
            return
        try:
            if not self._check_result(detail_content.text, jobid, detail_url):
                self.failed_list.append(jobid)
            else:
                return detail_content
        except Exception as e:
            logging.info(e.message)
            self.failed_list.append(jobid)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'seeds: %s\n' % self.seeds
            msg += "saved: %d\n" % self.pagestore.saved_count
            msg += 'captcha times: %s\n' % self._captcha_times
            msg += 'remain seeds: %d\n' % len(self.failed_list)
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
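# For reference, BaseChsiSpider.get_jobid maps a seed such as
# {'yxdm': '10001', 'years': '15', 'kldm': '5', 'bkcc': '1', 'wclx': 1,
#  'start': 10, 'zydm': '080901'} (values are illustrative) to the page id
# '10001/15/5/1/1/10/080901/1'; the trailing component is start / 10,
# i.e. the zero-based result-page index.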
class PatentAbstractSpider(ZhuanliBaseSpider, Main):
    """Patent abstract spider."""

    def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'):
        ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90)
        Main.__init__(self)
        self.short_tag = 't:m:s:r:o:h:v:'
        self.tags = ['recover=', 'threads=', 'mode=', 'seeds=', 'output=']
        self.seeds = seeds
        self.page_size = 20  # 3, 10 or 20
        self.pagestore = PatentAbstractStore('abstract')
        self.failed_saver = FailedJobSaver('failed_job.txt')
        self.seed_saver = LinkSaver('seed.year.txt', 'a+')
        self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+')
        self.mode = mode
        self.__version = '1.0.0'
        self.utils = threading.local()
        self.sp_errors = OrderedDict()
        self.pre_save_count = 0
        self.properties = PropertiesManager()
        self.can_load_seed = True

    def output(self, args):
        print '_patent_spider.py: %s' % args

    def version(self):
        print '_patent_spider.py %s' % self.__version

    def usage(self):
        print '_patent_spider.py usage:'
        print '-h, --help: print help message.'
        print '-v, --version: print script version'
        print '-o, --output: input an output verb'
        print '-t, --threads: thread count'
        print '-m, --mode: mode, any value other than id selects abstract mode'
        print '-r, --recover: recover, 1 or True for recover mode'
        print '-s, --seeds: seeds file'

    def _set_proxy(self, kwargs, selproxy):
        super(PatentAbstractSpider, self)._set_proxy(kwargs, selproxy)
        setattr(self.utils, 'proxy', selproxy)

    def handle(self, opts):
        for o, a in opts:
            if o in ('-h', '--help'):
                self.usage()
                sys.exit(1)
            elif o in ('-v', '--version'):
                self.version()
                sys.exit(0)
            elif o in ('-o', '--output'):
                self.output(a)
                sys.exit(0)
            elif o in ('-t', '--threads'):
                self.thread_count = int(a)
            elif o in ('-m', '--mode'):
                self.mode = a
            elif o in ('-s', '--seeds'):
                self.seeds = a
            elif o in ('-r', '--recover'):
                self.recover = True if (a == '1' or a == 'True') else False
            else:
                print 'unhandled option'
                sys.exit(3)
        if self.mode != 'id':
            self.mode = 'abs'
        if self.mode != 'id' and not os.path.exists(self.seeds):
            print 'seed file %s not exists' % self.seeds
            sys.exit(1)
        count = 3
        while count > 0:
            self.sp_proxies = OrderedDict()
            if self.mode == 'id':
                # self.set_proxy('183.111.169.203:8080', len(self.sp_proxies))
                self.set_proxy('192.168.1.39:3428:ipin:helloipin', len(self.sp_proxies))
            else:
                proxies = KuaidailiProxyManager.load_proxy(100)
                print 'load %d proxies from kuaidaili' % proxies['data']['count']
                if proxies['data']['count'] > 0:
                    self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
            self.run()
            count -= 1

    def load_proxy(self, fn, index=-1, auto_change=True):
        super(PatentAbstractSpider, self).load_proxy(fn, index, auto_change)
        with self.locker:
            self.sp_errors.clear()
            for proxy in self.sp_proxies.iterkeys():
                self.sp_errors[proxy] = 0

    def set_proxy(self, prs, index=-1, auto_change=True):
        with self.locker:
            if isinstance(prs, list):
                for p in prs:
                    self.sp_errors[p] = 0
            elif isinstance(prs, str) or isinstance(prs, unicode):
                self.sp_errors[prs] = 0
        super(PatentAbstractSpider, self).set_proxy(prs, index, auto_change)

    @staticmethod
    def gen_list_seed():
        now = datetime.now()
        this_year = int(now.strftime('%Y'))
        this_month = int(now.strftime('%m'))
        types = ['fmgb', 'fmsq', 'xxsq', 'wgsq']
        seeds = []
        for year in range(1985, this_year):
            for month in range(1, 13):
                for t in types:
                    seeds.append(
                        {'type': t, 'index': 1, 'time': '%s%s' % (year, (month if month > 9 else '0%s' % month))})
        for month in range(1, this_month):
            for t in types:
                seeds.append(
                    {'type': t, 'index': 1, 'time': '%s%s' % (this_year, (month if month > 9 else '0%s' % month))})
        return seeds

    def load_abstract_seeds(self, seed_file, limit=1000000):
        seeds = []
        last_position = self.properties.get('position', 0)
        f = open(seed_file, 'r')
        count = 0
        f.seek(last_position)
        while count < limit:
            l = f.readline()
            if not l:
                # end of file, nothing left to read
                self.can_load_seed = False
                break
            res = l.strip().split(',')
            if len(res) < 3:
                print 'invalid seeds:', l
            else:
                seeds.append({'type': res[1], 'id': res[0], 'code': res[2]})
                count += 1
        last_position = f.tell()
        self.properties.set('position', last_position)
        self.properties.save()
        f.close()
        return seeds

    def get_id_seeds(self):
        raw_seeds = self.gen_list_seed()
        rds = self.job_log.readlines()
        # get done jobs
        done_jobs = {}
        for job in rds:
            if '[' == job[0]:
                continue
            js = job.strip().split('-')
            done_jobs['%s-%s' % (js[0], js[1])] = {}
            done_jobs['%s-%s' % (js[0], js[1])]['pages'] = int(js[2])
            done_jobs['%s-%s' % (js[0], js[1])]['current'] = 1
        # load done seeds
        dss = self.seed_saver.readlines()
        for ds in dss:
            sd = ds.strip().split(',')
            if len(sd) < 4:
                print 'invalid seed', ds
                continue
            js = sd[3].split('-')
            sid = '%s-%s' % (js[0], js[1])
            page = int(js[2])
            if done_jobs.has_key(sid) and done_jobs[sid]['current'] < page:
                done_jobs[sid]['current'] = page
        seeds = []
        for seed in raw_seeds:
            sid = seed['time'] + '-' + seed['type']
            if done_jobs.has_key(sid):
                if done_jobs[sid]['pages'] > done_jobs[sid]['current'] > 1:
                    for page in range(done_jobs[sid]['current'] + 1, done_jobs[sid]['pages'] + 1):
                        s = copy.deepcopy(seed)
                        s['index'] = page
                        seeds.append(s)
            else:
                seeds.append(seed)
        logging.info('load %s list seeds', len(seeds))
        return seeds

    def get_abstract_seeds(self, limit=100000):
        rawseeds = self.load_abstract_seeds(self.seeds, limit)
        seeds = []
        for s in rawseeds:
            if not self.recover or not self.pagestore.find_any(self.pagestore.channel + '://' + s['id']):
                seeds.append(s)
            if len(seeds) >= limit:
                break
        logging.info('load %d abstract seeds', len(seeds))
        return seeds

    def report(self):
        super(PatentAbstractSpider, self).report()
        self.job_log.flush()
        self.seed_saver.flush()
        count = self.pagestore.saved_count - self.pre_save_count
        self.pre_save_count = self.pagestore.saved_count
        print 'save %d doc in this minute' % count

    def dispatch(self):
        self.failed_saver.tag()
        if self.mode == 'id':
            seeds = self.get_id_seeds()
            for seed in seeds:
                self.add_main_job(seed)
        else:
            count = 10
            ever_loaded = False
            while count > 0 and self.can_load_seed:
                seeds = self.get_abstract_seeds()
                if len(seeds) > 0:
                    ever_loaded = True
                    for seed in seeds:
                        self.add_main_job(seed)
                    time.sleep(2)
                    self.wait_q()
                elif ever_loaded:
                    count -= 1
                    time.sleep(100)
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def extract_seed_id(pub, app, count):
        return '%s-%s/%s-%s/%s' % (
            pub[0], pub[1], app[0] if (app[0] != '-') else '', app[1] if (app[1] != '-') else '', count)

    @staticmethod
    def parse_seed(seed):
        v = seed.split(',')
        if len(v) != 7:
            print 'invalid seed', seed
            return []
        return [[v[1][1:], v[2][:-1]], [v[3][1:], v[4][:-1]], int(v[6])]

    @staticmethod
    def get_query_word(jobid):
        word = '公开(公告)日=%s' % jobid['time']
        return word

    def _on_shutdown(self, jobid):
        self.failed_saver.save('2,%s' % str(jobid))

    def handle_id_job(self, jobid):
        strword = self.get_query_word(jobid)
        url = self.form_query_url(strword, page=jobid['index'], size=self.page_size, selected=jobid['type'],
                                  showtype=0)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encounter', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            print 'error page returned for', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        patents = re.findall(r'<a href="javascript:zl_xm\(\'([\d\w]+)\',\'(\w+)\',\'([\w\d]+)\'\);">[\d\w]+</a>',
                             con.text)
        print '[%d]%s-%s-%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'])
        if 0 == len(patents):
            self.job_log.add('[%d]%s-%s-%s,%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'], con.code))
            self.re_add_job(jobid)
            return
        for p in patents:
            if len(p) != 3:
                logging.warn('invalid pattern matched:%s,%s', str(p), str(jobid))
                self.failed_saver.save('1,%s' % str(jobid))
            else:
                self.seed_saver.add(
                    '%s,%s,%s,%s-%s-%d' % (p[0], p[1], p[2], jobid['time'], jobid['type'], jobid['index']))
        if 1 == jobid['index']:
            m = re.search(r'javascript:if\(event.keyCode == 13\) zl_tz\((\d+)\)', con.text)
            if m:
                pagecnt = int(m.group(1))
                print '[%d][%d]%s-%s-%d' % (len(patents), pagecnt, jobid['time'], jobid['type'], jobid['index'])
                self.job_log.add('%s-%s-%s' % (jobid['time'], jobid['type'], pagecnt))
                for page in range(2, pagecnt + 1):
                    job = copy.deepcopy(jobid)
                    job['_failcnt_'] = 0
                    job['index'] = page
                    self.add_job(job)
            else:
                print 'failed to find count[%d]%s-%s-[%d]' % (len(patents), jobid['time'], jobid['type'], 0)
                logging.warn('failed to find page count:%s-%s-%s', jobid['time'], jobid['type'], jobid['index'])

    def handle_abstract_seed(self, jobid):
        qword = quote('申请号=\'%s\' and %s=1' % (jobid['id'], jobid['code']))
        url = 'http://epub.sipo.gov.cn/patentdetail.action?strSources=%s&strWhere=%s&strLicenseCode=&pageSize=6&pageNow=1' % (
            jobid['type'], qword)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encounter', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            print 'error page returned for', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        print 'success:%s-%s-%s' % (jobid['id'], jobid['type'], jobid['code'])
        self.pagestore.save(int(time.time()), jobid['id'], url, con.text)

    def run_job(self, jobid):
        if self.check_shutdown(jobid):
            return
        try:
            if self.mode == 'id':
                self.handle_id_job(jobid)
            else:
                self.handle_abstract_seed(jobid)
        except RuntimeError as e:
            if 'no proxy' in e.message:
                self.re_add_job(jobid)
                self.reload_proxy()
                return
            else:
                raise

    def reload_proxy(self):
        prs = {}
        count = 3
        while count > 0:
            if 'id' == self.mode:
                prs = KuaidailiProxyManager.load_proxy(20)
            else:
                prs = KuaidailiProxyManager.load_proxy(100)
            if prs['data']['count'] > 0:
                break
            count -= 1
        if count <= 0 or not prs.has_key('data') or not prs['data'].has_key('count') or prs['data']['count'] <= 0:
            self._shutdown()
            logging.error('cannot load any proxy')
            spider.util.sendmail(['*****@*****.**'], 'Proxy Error',
                                 'Cannot load any proxy:%s,%s' % (self._name, self.mode))
            return
        print 'load %d proxies from kuaidaili' % prs['data']['count']
        self.set_proxy(prs['data']['proxy_list'], 15 if (prs['data']['count'] > 15) else 0)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.job_log.flush()
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)

    def proxy_error(self):
        proxy = getattr(self.utils, 'proxy')
        if proxy is not None:
            with self.locker:
                try:
                    if self.sp_errors[proxy] < 5:
                        self.sp_errors[proxy] += 1
                    else:
                        self.sp_proxies.pop(proxy)
                        if len(self.sp_proxies) == 0:
                            self.reload_proxy()
                except KeyError:
                    pass

    def on_proxy_error(self, con, jobid):
        self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_400_exception(self, con, jobid):
        if con.code == 403:
            self.proxy_error()
            self.re_add_job(jobid)
            return True

    def on_other_500_exception(self, con, jobid):
        if 504 == con.code and re.search('proxy', con.text, re.I):
            self.proxy_error()
            self.re_add_job(jobid)
            return True
        else:
            return super(PatentAbstractSpider, self).on_other_500_exception(con, jobid)
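# Hypothetical command lines for the options handled above (values are
# illustrative; the actual entry-point wiring lives in Main):
#
#     python _patent_spider.py -m id -t 4                      # crawl list pages, collect patent ids
#     python _patent_spider.py -m abs -t 8 -s seed.dat -r 1    # crawl abstracts from a seed file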