def crawl_okaybuy(logger, store_name):
    job_rules = okaybuy.gen_job_rules(store_name)
    bee.run_job(job_rules, max_idle_cnt=3, job_status_interval=3, logger=logger)
def crawl_taobao(logger, store_name):
    job_rules = taobao.gen_job_rules(store_name)
    bee.run_job(job_rules, max_idle_cnt=3, job_status_interval=3, logger=logger)
def test_simple_crawling_job(logger):
    seed_url = "file://%s/demosite/index.html" % (os.path.abspath('.'),)
    sample_job_rules = {
        "desc": "Example job, crawling the test site, extract prod desc",
        "name": "sample",
        "num_workers": 1,
        "worker_params": {
            "max_crawler_failed_cnt": 3,
            "max_crawler_timeout": 30,
            "crawler_retry_interval": 10,
            "pause_on_reinject": 0.1,
            "pause_before_fetch": 0,
            "pause_when_notask": 0.1,
        },
        "linkdb": {
            "class_name": "bee.SqliteLinkDB",
            "params": {
                "name": "sample_site.link.db",
            }
        },
        "task_queue": {
            "class_name": "bee.MemTaskQueue",
            "params": {},
        },
        "output": {
            "class_name": "bee.JsonDumper",
            "params": {
                "filename": "sample_site.out.json"
            }
        },
        "fetcher_factory": {
            "rules": {
                "simple_http_get": {
                    "class_name": "bee.SimpleHTTPFetcher",
                    "params": {
                        "timeout": 10,
                        "user_agent": "Bee: picking good stuffs",
                        "proxy_host": "",
                        "proxy_port": 0,
                        "from_encoding": "utf-8"
                    }
                }
            }
        },
        "seeker_factory": {
            "rules": {
                "simple_seek": {
                    "class_name": "bee.RuleBasedSeeker",
                    "params": {
                        "rules": [
                            # Category pages: keep seeking for more links.
                            [r".*/cat\d+\.html", 3, 60, "simple_http_get", ["simple_seek"], [], False],
                            # Product pages: hand off to the product miner.
                            [r".*prod\d+\.html", 1, 60, "simple_http_get", [], ["simple_miner"], False],
                        ],
                    }
                },
            }
        },
        "miner_factory": {
            "rules": {
                "simple_miner": {
                    "class_name": "ProductMiner",
                    "params": {}
                },
            }
        },
        "seed_tasks": [
            {
                "url": seed_url,
                "fetcher": "simple_http_get",
                "seekers": ["simple_seek"],
                "miners": [],
                "hop": 0,
                "revisit_interval": 60,
            }
        ]
    }
    bee.run_job(sample_job_rules, max_idle_cnt=3, job_status_interval=1, logger=logger)
def crawl_paixie(logger, store_name):
    job_rules = paixie.gen_job_rules(store_name)
    bee.run_job(job_rules, max_idle_cnt=3, job_status_interval=3, logger=logger)
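
# A minimal usage sketch (assumption: this module is run directly and the
# imports for os, bee, okaybuy, taobao, and paixie appear earlier in the
# file). It builds a plain stdlib console logger and launches the
# self-contained sample job against the bundled demosite pages; the
# store-specific crawls defined above would be launched the same way.
if __name__ == "__main__":
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
    logger = logging.getLogger("bee.demo")

    # Run the example job defined in test_simple_crawling_job.
    test_simple_crawling_job(logger)

    # A store crawl would look like (hypothetical store name):
    # crawl_taobao(logger, "some_store_name")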