# Load the URL list and set up the Gearman client plus an empty task set.
urls = get_urls(URLS_DB)
client = GearmanClient(["10.61.0.145"])
tasks = Taskset()
TASK_URLS_NUM = 100  # number of URLs packed into a single task

# Distribute the work: one "crawl" task per slice of TASK_URLS_NUM URLs.
# Stepping range() visits the same offsets (0, 100, 200, ...) as a manual counter.
for i in range(0, len(urls), TASK_URLS_NUM):
    sub_urls = urls[i : i + TASK_URLS_NUM]
    # The workload is the slice joined with newlines; the slice's start offset
    # is passed as the third argument (presumably the task's unique id --
    # TODO confirm against Task1).
    workload = "\n".join(sub_urls)
    t = Task1("crawl", workload, str(i), timeout=TASK1_TIMEOUT, retry_count=1)
    tasks.add(t)
    print("add task:%s" % t.uniq)

# 0. Initialize the database that receives the results returned by the workers.
print("0.initialize database for results.")
# Build one "<category> int" column per entry in CATES, next to the "word"
# primary key and the "total" column.
tmps = ["%s int" % cate for cate in CATES]
cates_str = ",".join(tmps)
tb_sql = "create table %s (word text primary key,%s,total int);" % (
    RAW_WORDS_TB,
    cates_str,
)
print(tb_sql)
init_db(TASK1_RESULT_DB, tb_sql)

# 1. Run the tasks in parallel.
# NOTE(review): no do_taskset() call follows this announcement here -- confirm
# that the task set is actually submitted elsewhere.
print("1.Preprocess tasks:")
return urls if __name__ == "__main__": urls = get_urls(URLS_DB) client = GearmanClient(["10.61.0.145"]) tasks = Taskset() TASK_URLS_NUM = 100 # disptribute task i = 0 while i < len(urls): sub_urls = urls[i : i + TASK_URLS_NUM] workload = "\n".join(sub_urls) t = Task1("crawl", workload, str(i), timeout=TASK1_TIMEOUT, retry_count=1) tasks.add(t) print "add task:%s" % t.uniq i += TASK_URLS_NUM # test pass # 0.init database for return result from worker print "0.initialize database for results." tmps = ["%s int" % cate for cate in CATES] cates_str = ",".join(tmps) tb_sql = "create table %s (word text primary key,%s,total int);" % (RAW_WORDS_TB, cates_str) print tb_sql init_db(TASK1_RESULT_DB, tb_sql) # 1.run the tasks in parallel print "1.Preprocess tasks:" client.do_taskset(tasks)