Example #1
def start_yingyongbao(**kwargs):
    hr = HandleRedis(1)
    type2 = hr.get_data_redis("yingyongbao_types")
    t = type2.split(',')[0][2:-1]
    a = 0
    for i in range(0, 2000):
        try:
            kwargs = dict(module="yingyongbao_data", data=dict(t=t, i=i))
            dicts = start(**kwargs)
            if dicts:
                log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(dicts)))
                details = dict(module="yingyongbao_save_details", data=dicts)
                # start(**details)
                comment = dict(module="yingyongbao_save_comment", data=dicts)
                # start(**comment)
                threading.Thread(target=start, kwargs=details,
                                 name=None).start()
                threading.Thread(target=start, kwargs=comment,
                                 name=None).start()
            else:
                a += 1
            if a == 10:
                break
        except Exception as e:
            raise e
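
Every entry point in these examples pulls its work items from Redis through the same HandleRedis helper, which is defined elsewhere in the project. Below is a minimal, hypothetical sketch of what it might look like, assuming it wraps redis-py; the database-index constructor argument, the pop semantics of get_data_redis, and the push semantics of cache_list_redis are assumptions inferred from how the examples use them.

import json
import redis


class HandleRedis:
    """Hypothetical sketch only; the real implementation lives in the project."""

    def __init__(self, db):
        # db is assumed to be the Redis database index (HandleRedis(1), HandleRedis(7), ...).
        self.client = redis.StrictRedis(host="127.0.0.1", port=6379, db=db)

    def get_data_redis(self, key):
        # Assumed to pop one pending item (a keyword, type string, or URL) from a list.
        value = self.client.lpop(key)
        return value.decode("utf-8") if value else None

    def cache_list_redis(self, key, items):
        # Assumed to push a batch of crawled records onto a list as JSON strings.
        if items:
            self.client.rpush(key, *[json.dumps(item, ensure_ascii=False) for item in items])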
Example #2
def main():
    global hr
    hr = HandleRedis(7)
    while True:
        url = hr.get_data_redis("TB_CREDIT_FJ_URL")
        if url:
            kwargs = dict(url=url)
            get_detail(**kwargs)
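
main() polls Redis in a tight loop and spins without pausing whenever TB_CREDIT_FJ_URL is empty. A small variation that backs off when the queue is drained could look like this, reusing the HandleRedis and get_detail names from the example above; the one-second interval is an arbitrary choice.

import time


def main():
    hr = HandleRedis(7)
    while True:
        url = hr.get_data_redis("TB_CREDIT_FJ_URL")
        if url:
            get_detail(url=url)
        else:
            # Queue is empty: pause briefly instead of busy-polling Redis.
            time.sleep(1)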
Example #3
def baidu_shixin(**kwargs):
    """
       定时任务调用失信爬虫百度失信的爬取策略
       :return:
       """
    # ip_pool=get_proxies_from_redis()
    r = HandleRedis(1)
    name = r.get_data_redis("shixin_words")
    # flag is a switch that decides whether the keywords need to be re-traversed
    # flag = r.get('baidushixin_flag')

    if not name:
        log.crawler.info('Baidu shixin keywords have all been consumed.....')
        return
    try:
        pn = 0
        hr = HandleRedis(7)
        while isinstance(pn, int):
            kwargs = dict(module='baidu', data=dict(name=name, pn=pn))
            # if ip_pool:
            #     proxies = random.choice(ip_pool)
            # else:
            #     ip_pool = get_proxies_from_redis()
            #     proxies = ip_pool.pop()
            # kwargs['data']['proxies'] = proxies
            log.crawler.info("crawler name is:{},pn is:{}".format(name, pn))
            result_dict = start(**kwargs)
            qiye = result_dict['enterprise']
            person = result_dict['person']
            if qiye:
                hr.cache_list_redis('TB_SHIXIN_ENTERPRISE', qiye)
                log.crawler.info(
                    "cache qiye shixin into redis success length is:%s" %
                    len(qiye))
            if person:
                hr.cache_list_redis('TB_SHIXIN_PERSON', person)
                log.crawler.info(
                    "cache person shixin into redis success length is:%s" %
                    len(person))
            pn = result_dict["pn"]
            if pn == "finished":
                log.crawler.info("数据请求完毕name:{},pn:{}".format(name, pn))
                break
            elif pn == 2000:
                break
            else:
                pn += 10
    except Exception as err:
        log.error.info('Baidu shixin crawler raised an exception:\n%s' % err)
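
The commented-out block inside the paging loop hints at per-request proxy rotation. A small sketch of that pattern, keeping the get_proxies_from_redis name from the comments (its return type, a list of requests-style proxy dicts, is an assumption):

import random


def pick_proxy(ip_pool):
    # ip_pool is assumed to be a list of requests-style proxy dicts, e.g.
    # {"http": "http://1.2.3.4:8080", "https": "http://1.2.3.4:8080"}.
    if not ip_pool:
        # Refill from Redis when the pool runs dry; get_proxies_from_redis is
        # the project helper named in the comments above (not defined here).
        ip_pool = get_proxies_from_redis()
    return random.choice(ip_pool)

# Inside the paging loop the chosen proxy would be attached to the request:
# kwargs['data']['proxies'] = pick_proxy(ip_pool)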
Example #4
def start_wenshu_peichang(**kwargs):
    r = HandleRedis(1)
    name = r.get_data_redis("wenshu_keys")
    if not name:
        log.crawler.info('Court judgment (wenshu) keywords have all been consumed.....')
        return
    print(name)
    index = 1
    while True:
        log.crawler.info("*" * 80)
        log.crawler.info("start crawler wenshu page is:%d" % index)
        kwargs = dict(module="wenshu_peichang_data",
                      data=dict(key=name, index=index),
                      proxies=True)
        items = start(**kwargs)
        if not items:
            break
        log.crawler.info("Number of document IDs fetched: %d" % (len(items) - 1))
        if len(items) == 1:
            break
        run_eval = items[0]['RunEval']
        # monkey.patch_all()
        # pool = Pool(20)
        threads = []
        for item in items[1:]:
            # Keep at most 10 detail threads alive; prune the ones that finished.
            threads = [t1 for t1 in threads if t1.is_alive()]
            while len(threads) >= 10:
                time.sleep(3)
                threads = [t1 for t1 in threads if t1.is_alive()]
            data = {}
            data["docid"] = item["文书ID"]
            data["CASE_TYPE"] = item["案件类型"]
            data["CASE_TIME"] = item["裁判日期"]
            data["CASE_NAME"] = item["案件名称"]
            data["CASE_NUM"] = item["案号"]
            data["COURT_NAME"] = item["法院名称"]
            data['runeval'] = run_eval
            d = dict(module="wenshu_peichang_detail", data=data, proxies=True)
            t = threading.Thread(target=start, kwargs=d, name=None)
            t.daemon = True
            t.start()
            threads.append(t)
        # Wait for the remaining detail threads before moving on to the next page.
        for t in threads:
            t.join()
        index += 1
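
The manual bookkeeping in start_wenshu_peichang (prune dead threads, cap the pool at 10, join at the end of each page) is what concurrent.futures provides out of the box. A sketch of the same per-page fan-out with ThreadPoolExecutor, assuming the same start() dispatcher interface; the function name crawl_page_details is purely illustrative.

from concurrent.futures import ThreadPoolExecutor


def crawl_page_details(items, run_eval):
    # Same fan-out as the loop above; the 10-worker cap is handled by the pool.
    with ThreadPoolExecutor(max_workers=10) as pool:
        for item in items[1:]:
            data = {
                "docid": item["文书ID"],
                "CASE_TYPE": item["案件类型"],
                "CASE_TIME": item["裁判日期"],
                "CASE_NAME": item["案件名称"],
                "CASE_NUM": item["案号"],
                "COURT_NAME": item["法院名称"],
                "runeval": run_eval,
            }
            pool.submit(start, module="wenshu_peichang_detail",
                        data=data, proxies=True)
    # Leaving the with-block waits for every submitted detail crawl to finish.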
Example #5
def start_sougou(**kwargs):
    hr = HandleRedis(1)
    type2 = hr.get_data_redis("sougou_type").split(',')[0][2:-1]
    a = 0
    for i in range(0, 20000):
        try:
            kwargs = dict(module="sougou_content", data=dict(type1=type2, i=i))
            content = start(**kwargs)
            dicts = {"content": content}
            if content:
                log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(dicts)))
                details = dict(module="sougou_save_data", data=dicts)
                # start(**details)
                threading.Thread(target=start, kwargs=details,
                                 name=None).start()
            else:
                a += 1
            if a == 10:
                break
        except Exception as e:
            log.crawler.error(e)