示例#1
0
文件: jdutil.py 项目: lancong/Spiders
def jdholder2(queue, redis_client, JDBase, outlog, cookie=None, start=10, end=40):
    """Pull one task id from ``queue``, fetch it through ``JDBase`` and record the outcome.

    Args:
        queue: task queue; ``get()`` returns a tuple whose first element is the task id.
        redis_client: redis connection handed to ``JDBase`` for flag bookkeeping.
        JDBase: crawler instance (shared across threads by the caller) that performs the request.
        outlog: path where successful results are appended.
        cookie: optional cookie string applied before the request.
        start, end: bounds (seconds) for the random post-request sleep.
    """
    if cookie:
        JDBase.set_cookie(cookie)
    JDBase.set_redis_client(redis_client)
    JDBase.set_result_save_path(outlog)
    JDBase.set_useragent(fetch_util.get_pc_useragent())
    # NOTE(review): queue.get() blocks indefinitely when the queue is empty —
    # confirm callers only invoke this while tasks remain.
    task_id = queue.get()[0]
    task_id = fetch_util.byte_to_str(task_id)
    JDBase.set_request_path(task_id)
    JDBase.execute()

    fetch_util.print_log('process  ' + JDBase.getshowlog() + '\t\n')

    # Did the fetch succeed?
    issucceed = JDBase.get_result()

    if issucceed:
        # Record the success flag for this task id.
        JDBase.save_succeed_log(task_id)
    else:
        # Record the failure flag so the task can be retried later.
        JDBase.save_failed_log(task_id)

    # Sleep a random interval in [start, end]; runs on both success and failure.
    JDBase.sleep(start, end)
示例#2
0
文件: jdutil.py 项目: lancong/Spiders
def init_task2(alltaskreadlines, succeedfile, failedfile):
    """Filter already-processed tasks out of ``alltaskreadlines``.

    Tasks recorded in ``succeedfile`` and ``failedfile`` are removed from the
    full task list; the remaining tasks are returned.

    Args:
        alltaskreadlines: list of all task lines.
        succeedfile: path of the success-flag log (may not exist / be empty).
        failedfile: path of the failure-flag log (may not exist / be empty).

    Returns:
        list: tasks not yet marked as succeeded or failed.
    """
    # BUG FIX: the original re-filtered from the ORIGINAL list on every chunk
    # yielded by fetch_util.read() (remove_duplicate(sll, alltaskreadlines)),
    # so only the LAST chunk's duplicates were actually removed. Accumulate
    # into the running result instead.
    beforetasks = alltaskreadlines
    if os.path.exists(succeedfile) and os.path.getsize(succeedfile) > 0:
        for sll in fetch_util.read(succeedfile):
            beforetasks = fetch_util.remove_duplicate(sll, beforetasks)

    nowtasks = beforetasks
    if os.path.exists(failedfile) and os.path.getsize(failedfile) > 0:
        for fll in fetch_util.read(failedfile):
            nowtasks = fetch_util.remove_duplicate(fll, nowtasks)
    else:
        fetch_util.print_log("failed file size = 0")
    return nowtasks
示例#3
0
文件: jdutil.py 项目: lancong/Spiders
def jdholder(tasks, JDBase, succeedlog, failedlog, outlog, cookie=None, start=10, end=40):
    """Run every task in ``tasks`` through ``JDBase``, recording a success or
    failure flag per task and sleeping a random interval between requests.
    """
    jd = JDBase

    # Cookie is applied unconditionally (even when None), matching the
    # crawler's existing setup sequence.
    jd.set_cookie(cookie)
    jd.set_succeed_log_path(succeedlog)
    jd.set_failed_log_path(failedlog)
    jd.set_result_save_path(outlog)

    taskslen = len(tasks)

    for count, task in enumerate(tasks, 1):
        jd.set_useragent(fetch_util.get_pc_useragent())
        jd.set_request_path(task)
        jd.execute()

        fetch_util.print_log('process [' + str(count) + '/' + str(taskslen) + '] ' + ' ' + jd.getshowlog() + '\t\n')

        # Persist the per-task outcome flag.
        if jd.get_result():
            jd.save_succeed_log(task)
        else:
            jd.save_failed_log(task)

        # Random pause between requests.
        jd.sleep(start, end)
示例#4
0
文件: main.py 项目: lancong/Spiders
def func(lists, maincategory, outfile):
    """Fetch app metadata for every url in ``lists`` and append one
    tab-separated record per url to ``outfile``.

    Args:
        lists: iterable of page urls to scrape (may be None/empty).
        maincategory: top-level category label prefixed to each record.
        outfile: path the result lines are appended to.
    """
    # Nothing to do for None or an empty list (the truthiness test covers
    # the original's redundant `lists and len(lists) > 0`).
    if not lists:
        return

    alllen = len(lists)

    for count, currenturl in enumerate(lists, 1):
        pageinfo = PageInfo(currenturl)
        appname = pageinfo.getappname()
        # 'unknow' is the project's existing sentinel spelling — kept
        # byte-identical for output-file compatibility.
        category = pageinfo.getcategory() or 'unknow'
        tag = pageinfo.gettag() or 'unknow'

        # Progress log.
        fetch_util.print_log('[' + str(count) + '/' + str(alllen) + '] ' + currenturl)

        outinfo = currenturl + '\t' + maincategory + '>' + appname + '\tc:' + category + '\tt:' + tag

        # Append the record.
        fetch_util.write(outinfo, outfile)

        # Random pause to avoid hammering the site.
        sleep(fetch_util.randnum(10, 40))
示例#5
0
    def match_type1(self, html, requesturl, sources):
        """Handle a type-1 product page: build the breadcrumb line, append the
        item name (when present) and price, then persist and log the result.
        """
        breadcrumb = sources.get_text()
        result = requesturl + '\t' + fetch_util.replace_some_string(
            breadcrumb, '>', '>')

        # The item name lives in <div id="name"><h1>…</h1></div> when present.
        name_div = html.find("div", {'id': 'name'})
        if name_div:
            h1_node = name_div.find('h1')
            item_name = h1_node.contents[0]
            if item_name:
                result += '>' + item_name

        # Persist the successfully parsed line (with price appended) and
        # mark the request as succeeded.
        price = get_price(requesturl)
        self.save_result(result + price)
        fetch_util.print_log(result + price)
        self.set_print_log(requesturl + ' succeed')
        self.set_result(True)
示例#6
0
文件: jdmain.py 项目: lancong/Spiders
def main():
    """Entry point: drain the 'jd_20161024' redis task set with a pool of crawler threads."""
    # Start timestamp (seconds).
    starttime = time()

    # Output directory for logs/results.
    dirpath = jdconfig.jd_out_dir

    fetch_util.mkdirs(dirpath)

    # Result output file.
    outlog = dirpath + jdconfig.jd_out

    redis_pool = redis_util.get_redis_pool_connection()
    redis_client = redis.Redis(connection_pool=redis_pool)

    # all_task_size = task_dispatch.get_all_task_size(redis_client, 'jd_20161024')

    task_iter = task_dispatch.get_all_task_iter(redis_client, 'jd_20161024')

    queue = Queue()

    # Move all tasks from the redis iterator into the local queue.
    task_dispatch.get_task_queue(queue, task_iter)

    task_size = queue.qsize()

    if task_size == 0:
        fetch_util.print_log("任务总数为0,结束任务")
        return

    fetch_util.print_log("任务总数: " + str(task_size))

    # Build the cookie (region: beijing).
    cookie = jdutil.jd_pc_cookie('beijing')

    jd = JDPage()

    # NOTE(review): this loop has no break, so everything after it is
    # unreachable, and once the queue empties the worker threads block
    # forever inside jdholder2's queue.get(). Confirm whether a termination
    # condition (e.g. break when the queue is empty after join) was intended.
    while True:

        threads = []
        for i in range(jdconfig.thread_num):
            # th = threading.Thread(target=jdutil.jdholder, args=(task, jd, succeedlog, failedlog, outlog, cookie, 0, 5))
            th = threading.Thread(target=jdutil.jdholder2, args=(queue, redis_client, jd, outlog, cookie, 5, 10))
            th.start()
            threads.append(th)
        for th in threads:
            th.join()

    # End timestamp (currently unreachable — see NOTE above).
    endtime = time()

    fetch_util.print_log("网页数据获取结束,耗时: " + str(float(endtime - starttime) / 60) + "分钟")

    # Write the elapsed-time record (minutes) to a timestamped file.
    costtimepath = dirpath + 'jd_' + fetch_util.get_time_yyyymmddhhmmss() + '.txt'
    fetch_util.write(str(float(endtime - starttime) / 60), costtimepath, 'w')

    pass
示例#7
0
文件: jdutil.py 项目: lancong/Spiders
def init_task(urlsfile, succeedfile, failedfile):
    """Load the full task list from ``urlsfile`` and strip out every task
    already recorded in the succeed or failed log files.

    Returns [] when the url file does not exist.
    """
    if not fetch_util.file_exist(urlsfile):
        fetch_util.print_log(urlsfile + " not exist")
        return []

    def _read_lines(path):
        # Lines of the file, or [] when it does not exist.
        if not os.path.exists(path):
            return []
        with open(path, "r") as fh:
            return fh.readlines()

    with open(urlsfile, 'r') as fh:
        alltaskreadlines = fh.readlines()

    # Remove succeeded tasks first, then failed ones.
    remaining = fetch_util.remove_duplicate(_read_lines(succeedfile), alltaskreadlines)
    return fetch_util.remove_duplicate(_read_lines(failedfile), remaining)
示例#8
0
文件: jdmain.py 项目: lancong/Spiders
def jdholder2(task, JDBase, succeedlog, failedlog, outlog, cookie=None, start=10, end=40):
    """Fetch a single ``task`` url via ``JDBase``, record the success/failure
    flag, then sleep a random interval between ``start`` and ``end`` seconds.
    """
    if cookie:
        JDBase.set_cookie(cookie)

    # Wire up log and result destinations before issuing the request.
    JDBase.set_succeed_log_path(succeedlog)
    JDBase.set_failed_log_path(failedlog)
    JDBase.set_result_save_path(outlog)
    JDBase.set_useragent(fetch_util.get_pc_useragent())
    JDBase.set_request_path(task)
    JDBase.execute()

    fetch_util.print_log('process  ' + JDBase.getshowlog() + '\t\n')

    # Persist a flag describing how the fetch went.
    if JDBase.get_result():
        JDBase.save_succeed_log(task)
    else:
        JDBase.save_failed_log(task)

    # Random pause; runs on both success and failure.
    JDBase.sleep(start, end)
示例#9
0
def save_result(path, request_url, bread_tag_name, price):
    """Log and persist one scraped record; logs a failure when no breadcrumb
    tag was extracted, and warns when no output path is configured.
    """
    # Guard: no breadcrumb means the page parse failed.
    if not bread_tag_name:
        fetch_util.print_log(request_url + ' failed')
        return

    save_info = request_url + '\t' + bread_tag_name + '\t' + price
    fetch_util.print_log(request_url + ' succeed')
    fetch_util.print_log_debug(save_info)

    if not path:
        fetch_util.print_log('****** 未设置结果存储路径!!! ******')
        return

    fetch_util.write(save_info, path)
示例#10
0
文件: main.py 项目: lancong/Spiders
def main():

    # 指定大类别url
    outfile = '/Users/Lan/TestDir/out/wandoujia.txt'
    # 文件输出位置
    specurls = ['http://www.wandoujia.com/category/396']


    # outfile = wandoujiaconfig.outfile

    # specurls = wandoujiaconfig.specurls

    # specurls = ['http://www.wandoujia.com/category/382', 'http://www.wandoujia.com/category/388',
    #             'http://www.wandoujia.com/category/402', 'http://www.wandoujia.com/category/392']

    allurls = []

    fetch_util.print_log('update request urls ...')

    for specurl in specurls:
        maincategoryurls = MainCategoryUrls(specurl)
        url = maincategoryurls.geturls()
        allurls.append(url)

    urls = fetch_util.liststolist(allurls)

    fetch_util.print_log('update request urls finished, len: ' + str(len(urls)))

    for url in urls:

        parentpage = ParentPage(url=url)
        requesturls = parentpage.getpageurls()
        if requesturls and len(requesturls) > 0:
            for requesturl in requesturls:
                parentpageurl = ParentPageUrl(requesturl)
                # 当前主大类别
                maincategory = parentpageurl.getcategory()
                if not maincategory:
                    maincategory = 'unknow'
                # 当前页面可请求urls
                currenturls = parentpageurl.getcurpageurls()

                if currenturls and len(currenturls) > 0:

                    tasks = fetch_util.task_dispatch(currenturls, 10)

                    threads = []

                    for task in tasks:
                        th = threading.Thread(target=func, args=(task, maincategory, outfile))
                        th.start()
                        threads.append(th)
                        pass

                    for th in threads:
                        th.join()

        # 写入结果
        fetch_util.write('\r\n------ i am line -----\r\n', outfile)
        fetch_util.print_log("has finish: " + url)

        sleep(fetch_util.randnum(10, 30))