def start_yingyongbao(**kwargs):
    hr = HandleRedis(1)
    # Take the first category from the "yingyongbao_types" value and strip its wrapping characters.
    type2 = hr.get_data_redis("yingyongbao_types")
    t = type2.split(',')[0][2:-1]
    misses = 0  # consecutive empty pages
    for i in range(0, 2000):
        try:
            kwargs = dict(module="yingyongbao_data", data=dict(t=t, i=i))
            dicts = start(**kwargs)
            if dicts:
                log.crawler.info("fetched %s page %d, content length: %d" % (type2, i, len(dicts)))
                details = dict(module="yingyongbao_save_details", data=dicts)
                # start(**details)
                comment = dict(module="yingyongbao_save_comment", data=dicts)
                # start(**comment)
                # Persist details and comments in background threads.
                threading.Thread(target=start, kwargs=details, name=None).start()
                threading.Thread(target=start, kwargs=comment, name=None).start()
            else:
                misses += 1
                if misses == 10:  # stop after 10 consecutive empty pages
                    break
        except Exception as e:
            raise e
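
# Illustrative sketch (assumption): the "yingyongbao_types" value read above is
# assumed to be a stringified list such as "['game', 'social']". Under that
# assumption the slice below recovers the first category name; this is only a
# demonstration of the parsing, not part of the crawler itself.
def _demo_parse_first_type():
    raw = "['game', 'social']"   # assumed redis payload format
    first = raw.split(',')[0]    # -> "['game'"
    return first[2:-1]           # strips "['" and the trailing quote -> "game"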
def main():
    global hr
    hr = HandleRedis(7)
    # Keep pulling detail-page URLs from redis and crawl each one.
    while True:
        url = hr.get_data_redis("TB_CREDIT_FJ_URL")
        if url:
            kwargs = dict(url=url)
            get_detail(**kwargs)
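
# Illustrative sketch (assumption): main() only consumes URLs from the
# "TB_CREDIT_FJ_URL" key, so a producer is assumed to fill that key elsewhere.
# The hypothetical example below reuses the cache_list_redis helper that
# baidu_shixin() uses to push lists; the URL is a placeholder, not a real
# endpoint from this project.
def _demo_seed_credit_fj_urls():
    producer = HandleRedis(7)
    urls = ["http://example.com/credit/detail?id=1"]  # placeholder URLs
    producer.cache_list_redis("TB_CREDIT_FJ_URL", urls)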
def baidu_shixin(**kwargs):
    """
    Crawl strategy for the Baidu shixin (dishonest-debtor) crawler, invoked by the scheduled task.
    :return:
    """
    # ip_pool = get_proxies_from_redis()
    r = HandleRedis(1)
    name = r.get_data_redis("shixin_words")
    # flag is a switch deciding whether the keywords need to be re-traversed
    # flag = r.get('baidushixin_flag')
    if not name:
        log.crawler.info('Baidu shixin keywords exhausted.....')
        return
    try:
        pn = 0
        hr = HandleRedis(7)
        while isinstance(pn, int):
            kwargs = dict(module='baidu', data=dict(name=name, pn=pn))
            # if ip_pool:
            #     proxies = random.choice(ip_pool)
            # else:
            #     ip_pool = get_proxies_from_redis()
            #     proxies = ip_pool.pop()
            # kwargs['data']['proxies'] = proxies
            log.crawler.info("crawler name is:{},pn is:{}".format(name, pn))
            result_dict = start(**kwargs)
            qiye = result_dict['enterprise']
            person = result_dict['person']
            if qiye:
                hr.cache_list_redis('TB_SHIXIN_ENTERPRISE', qiye)
                log.crawler.info("cache qiye shixin into redis success length is:%s" % len(qiye))
            if person:
                hr.cache_list_redis('TB_SHIXIN_PERSON', person)
                log.crawler.info("cache person shixin into redis success length is:%s" % len(person))
            pn = result_dict["pn"]
            if pn == "finished":
                log.crawler.info("all pages fetched, name:{},pn:{}".format(name, pn))
                break
            elif pn == 2000:
                break
            else:
                pn += 10
    except Exception as err:
        log.error.info('Baidu shixin crawler raised an exception:\n%s' % err)
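
# Illustrative sketch (inferred from the usage above, not from the baidu module
# itself): baidu_shixin() expects start(module='baidu', ...) to return a dict
# with 'enterprise' and 'person' result lists plus a 'pn' field that is either
# the sentinel string "finished", the cutoff 2000, or an integer offset that the
# loop advances by 10. The field contents below are placeholders.
_example_baidu_result = {
    "enterprise": [{"name": "placeholder company"}],  # cached to TB_SHIXIN_ENTERPRISE
    "person": [{"name": "placeholder person"}],       # cached to TB_SHIXIN_PERSON
    "pn": 10,                                          # or "finished" / 2000 to stop
}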
def start_wenshu_peichang(**kwargs):
    r = HandleRedis(1)
    name = r.get_data_redis("wenshu_keys")
    if not name:
        log.crawler.info('Wenshu (judgment document) keywords exhausted.....')
        return
    print(name)
    index = 1
    while True:
        log.crawler.info("*" * 80)
        log.crawler.info("start crawler wenshu page is:%d" % index)
        kwargs = dict(module="wenshu_peichang_data", data=dict(key=name, index=index), proxies=True)
        items = start(**kwargs)
        if not items:
            break
        log.crawler.info("number of document IDs fetched: %d" % (len(items) - 1))
        if len(items) == 1:
            break
        run_eval = items[0]['RunEval']
        # monkey.patch_all()
        # pool = Pool(20)
        threads = []
        for item in items[1:]:
            # Throttle: keep at most 10 detail threads alive at a time.
            threads = [t1 for t1 in threads if t1.is_alive()]
            while len(threads) >= 10:
                time.sleep(3)
                threads = [t1 for t1 in threads if t1.is_alive()]
            data = {
                "docid": item["文书ID"],
                "CASE_TYPE": item["案件类型"],
                "CASE_TIME": item["裁判日期"],
                "CASE_NAME": item["案件名称"],
                "CASE_NUM": item["案号"],
                "COURT_NAME": item["法院名称"],
                "runeval": run_eval,
            }
            d = dict(module="wenshu_peichang_detail", data=data, proxies=True)
            t = threading.Thread(target=start, kwargs=d, name=None)
            t.daemon = True
            t.start()
            threads.append(t)
        # Wait for the remaining detail threads before moving to the next page.
        for t in threads:
            t.join()
        index += 1
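
# Illustrative sketch (not part of the original crawler): the hand-rolled
# thread throttling above can also be expressed with the standard library's
# ThreadPoolExecutor, assuming start() remains the dispatch entry point. The
# detail-task dicts are placeholders for the ones built in
# start_wenshu_peichang().
from concurrent.futures import ThreadPoolExecutor

def _demo_fetch_details_with_pool(detail_tasks):
    # detail_tasks: iterable of dicts like dict(module="wenshu_peichang_detail", data=..., proxies=True)
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = [pool.submit(start, **task) for task in detail_tasks]
        for future in futures:
            future.result()  # re-raise any crawler exception here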
def start_sougou(**kwargs):
    hr = HandleRedis(1)
    # Take the first category from the "sougou_type" value and strip its wrapping characters.
    type2 = hr.get_data_redis("sougou_type").split(',')[0][2:-1]
    misses = 0  # consecutive empty pages
    for i in range(0, 20000):
        try:
            kwargs = dict(module="sougou_content", data=dict(type1=type2, i=i))
            content = start(**kwargs)
            dicts = {"content": content}
            if content:
                log.crawler.info("fetched %s page %d, content length: %d" % (type2, i, len(content)))
                details = dict(module="sougou_save_data", data=dicts)
                # start(**details)
                # Persist the page content in a background thread.
                threading.Thread(target=start, kwargs=details, name=None).start()
            else:
                misses += 1
                if misses == 10:  # stop after 10 consecutive empty pages
                    break
        except Exception as e:
            log.crawler.error(e)
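
# Illustrative sketch (assumption): the docstring of baidu_shixin() says these
# entry points are driven by a scheduled task that is not shown in this module.
# A minimal stdlib-only driver might look like the loop below; the one-hour
# interval and the set of functions run are placeholders, not the project's
# real schedule.
def _demo_run_crawlers_periodically():
    while True:
        for entry_point in (baidu_shixin, start_yingyongbao, start_sougou):
            try:
                entry_point()
            except Exception as exc:
                log.crawler.error(exc)
        time.sleep(60 * 60)  # placeholder interval: one hour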