示例#1
0
    urls = get_urls(URLS_DB)

    client = GearmanClient(['10.61.0.145'])
    tasks = Taskset()
    TASK_URLS_NUM = 100
    # distribute tasks: one task per batch of TASK_URLS_NUM urls
    i = 0
    while i < len(urls):
        sub_urls = urls[i:i + TASK_URLS_NUM]
        workload = '\n'.join(sub_urls)
        t = Task1('crawl',
                  workload,
                  str(i),
                  timeout=TASK1_TIMEOUT,
                  retry_count=1)
        tasks.add(t)
        print "add task:%s" % t.uniq
        i += TASK_URLS_NUM
        # test
        pass

    # 0. initialize the database that stores the results returned by workers
    print "0.initialize database for results."
    tmps = ["%s int" % cate for cate in CATES]
    cates_str = ','.join(tmps)
    tb_sql = "create table %s (word text primary key,%s,total int);" % (
        RAW_WORDS_TB, cates_str)
    print tb_sql
    init_db(TASK1_RESULT_DB, tb_sql)
    # 1.run the tasks in parallel
    print "1.Preprocess tasks:"
示例#2
0
    return urls


if __name__ == "__main__":
    # Script entry point: split the URL list into fixed-size batches, queue
    # one "crawl" task per batch, create the results table, and finally hand
    # the whole task set to the Gearman client.
    urls = get_urls(URLS_DB)

    client = GearmanClient(["10.61.0.145"])
    tasks = Taskset()
    TASK_URLS_NUM = 100  # maximum number of URLs packed into a single task

    # Distribute the work: step through `urls` in strides of TASK_URLS_NUM so
    # the last batch simply comes out shorter when the length is not a
    # multiple of the batch size.
    for i in range(0, len(urls), TASK_URLS_NUM):
        sub_urls = urls[i : i + TASK_URLS_NUM]
        # The workload is the batch's URLs, one per line.
        workload = "\n".join(sub_urls)
        # str(i) (the batch's start offset) is passed as the third argument;
        # presumably this is the task's unique id echoed below as t.uniq --
        # confirm against Task1's signature.
        t = Task1("crawl", workload, str(i), timeout=TASK1_TIMEOUT, retry_count=1)
        tasks.add(t)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("add task:%s" % t.uniq)

    # 0. Initialize the database that receives the results from the workers:
    # one integer column per category in CATES, plus a running total.
    print("0.initialize database for results.")
    tmps = ["%s int" % cate for cate in CATES]
    cates_str = ",".join(tmps)
    # The table and column names come from module constants, not user input.
    tb_sql = "create table %s (word text primary key,%s,total int);" % (RAW_WORDS_TB, cates_str)
    print(tb_sql)
    init_db(TASK1_RESULT_DB, tb_sql)

    # 1. Submit the queued task set to the Gearman client.
    print("1.Preprocess tasks:")
    client.do_taskset(tasks)