def search_sparse(mk_url, begin_date, end_date, word, quote_key=True,
                  name='', extract_entries=extract_entries, sleep_interval=2):
    """Search for a keyword between two dates via a paged search engine.

    Parameters:
        mk_url          -- callable(word, begin_date, end_date, page) -> URL
                           for one result page of the query
        begin_date      -- start of the date range
        end_date        -- end of the date range
        word            -- keyword to search for
        quote_key       -- NOTE(review): unused in this body; kept only for
                           backward compatibility -- confirm callers pass it
        name            -- human-readable label used in progress messages
        extract_entries -- callable(dom) -> list of entry dicts; each entry
                           must have a 'title' key (used for de-duplication)
        sleep_interval  -- seconds to sleep before fetching the next page
                           (was a hard-coded 2; now a backward-compatible
                           parameter with the same default)

    Returns whatever crawl_search_engine returns (the collected entries).
    """
    def _mk_url(page):
        # Bind the fixed query parameters, leaving only the page number free.
        return mk_url(word, begin_date, end_date, page)

    def _before_start():
        # Parenthesized print: identical output on Python 2, valid on Python 3.
        print('Getting %s on %s from %s to %s.' % (name, word, begin_date, end_date))

    def _uniquify(entries):
        # De-duplicate entries by their title.
        return unique(entries, lambda e: e['title'])

    def _after_end(entries):
        print('got %d %s entries.' % (len(entries), name))

    return crawl_search_engine(_mk_url, _before_start, rq_dom, extract_entries,
                               has_next_page, sleep_interval, _uniquify,
                               _after_end)
def test_crawl_search_engine():
    """Smoke-test generic_crawler.crawl_search_engine with logging stubs.

    Every callback merely logs that it ran; the fake has_next_page allows
    exactly one follow-up page (it stops once the page counter reaches 2).
    NOTE(review): an identical redefinition later in the file shadows this
    definition at import time, so only the later copy ever runs.
    """
    state = {'page': 1}

    def _has_next_page(dom):
        printf('running has_next_page')
        # Guard-clause form: stop once two pages have been "visited".
        if state['page'] >= 2:
            return False
        state['page'] += 1
        return True

    generic_crawler.crawl_search_engine(
        lambda page: printf('running mk_url'),
        lambda: printf('running before_start'),
        lambda u: printf('runnning rq_dom') or (None, None),
        lambda dom: printf('running extract_entries') or [],
        _has_next_page,
        1,
        lambda es: es,
        lambda es: printf('running after_end'))
def test_crawl_search_engine():
    """Exercise generic_crawler.crawl_search_engine end to end with stub
    callbacks that only log; the paging stub permits a single extra page."""
    counter = {"page": 1}

    def _has_next_page(dom):
        printf("running has_next_page")
        # Compute the decision once, bump the counter only when continuing.
        more = counter["page"] < 2
        if more:
            counter["page"] += 1
        return more

    # Name the stubs so the call below reads top-to-bottom.
    mk_url = lambda page: printf("running mk_url")
    before_start = lambda: printf("running before_start")
    rq_dom = lambda u: printf("runnning rq_dom") or (None, None)
    extract = lambda dom: printf("running extract_entries") or []

    generic_crawler.crawl_search_engine(
        mk_url,
        before_start,
        rq_dom,
        extract,
        _has_next_page,
        1,
        lambda es: es,
        lambda es: printf("running after_end"),
    )