def enterprise_list(**kwargs):
    """Pop one cached list-page task from Redis and crawl it via the
    ``shunqi_list`` module.

    The Redis key is supplied by the caller as ``kwargs['k']``. The cached
    value is either a plain URL string or the ``repr`` of a dict with
    ``url``/``city``/``prov`` keys. On failure the task is cached back to
    the *same* key it was read from so it can be retried.

    :param kwargs: must contain ``k`` (Redis key of the pending task)
    :raises ValueError: when ``k`` is missing or empty
    """
    k = kwargs.get("k", None)
    if not k:
        raise ValueError("k 参数存在错误......")
    item = hr.get_data_redis(k)
    if item:
        url = item
        # Cached value may be the repr of a task dict rather than a bare URL.
        item = ast.literal_eval(item)
        if isinstance(item, dict):
            url = item['url']
            city = item['city']
            prov = item['prov']
        log.crawler.info("start crawler prov:%s,city:%s,url is:%s" % (prov, city, url))
        module = "shunqi_list"
        kwargs = dict(module=module, data={
            "url": url,
            "city": city,
            "prov": prov
        })
        try:
            start(**kwargs)
        except Exception as err:
            # BUGFIX: the old code re-cached the task under the hard-coded
            # key "shunqi_detail" (copy-pasted from the detail crawler),
            # while the log message promises the *original* key. Use the
            # key this task was actually read from.
            log.crawler.info("顺企网详细页面爬取发生异常将参数缓存到原来的key")
            hr.put_str_into_redis(k, item)
def multhread_shunqidetail(**kwargs):
    """Worker entry point: pull one detail-page task from the
    ``shunqi_detail`` Redis key and crawl it, re-caching the task (and
    re-raising) on failure.

    :param kwargs: must contain ``id`` (numeric worker/greenlet id, used
        only for log correlation)
    """
    gevent_id = kwargs.get("id")
    log.crawler.info("gevent is start id is:%d" % gevent_id)
    redis_key = "shunqi_detail"
    cached = hr.get_data_redis(redis_key)
    if not cached:
        return
    target_url = cached
    # Cached value may be the repr of a task dict rather than a bare URL.
    cached = ast.literal_eval(cached)
    if isinstance(cached, dict):
        target_url = cached['url']
        city = cached['city']
        prov = cached['prov']
    log.crawler.info("start crawler prov:%s,city:%s,url is:%s" % (prov, city, target_url))
    task = dict(
        module=redis_key,
        data={"url": target_url, "city": city, "prov": prov},
    )
    try:
        start(**task)
        log.crawler.info("the gevent is finished id is:%d" % gevent_id)
    except Exception as err:
        # Push the task back so it can be retried, then propagate.
        log.crawler.info("顺企网详细页面爬取发生异常将参数缓存到原来的key")
        hr.put_str_into_redis("shunqi_detail", cached)
        raise err
def start_hei():
    """Kick off one page of the 'hei' (shixin) crawl.

    The current page number lives in Redis db 1 under ``hei_page``; it is
    initialised to 1 on first use and incremented after each run.
    """
    pool = RedisPool(client_db=1).redis_pool()
    raw_page = pool.get("hei_page")
    if raw_page:
        current = int(raw_page.decode())
    else:
        pool.set("hei_page", 1)
        current = 1
    log.crawler.info("start hei shixin page is:%d" % current)
    # Advance the cursor before crawling, exactly as the original did.
    pool.incrby("hei_page", 1)
    start(module="hei", data=dict(page=current))
def get_detail_url(**kwargs):
    """Crawl a cached ``shunqi_list`` first page, then iterate every
    remaining page of that listing.

    Reads one task (URL string or repr of a ``url``/``city``/``prov``
    dict) from the ``shunqi_list`` Redis key. The first request returns
    the total page count; pages 2..total are then crawled one by one.
    Failed tasks are cached back to Redis for retry (``shunqi_list`` for
    the first page, ``shunqi_list_error`` for subsequent pages).
    """
    k = "shunqi_list"
    item = hr.get_data_redis(k)
    # BUGFIX: stray debug print() replaced with the project logger, which
    # every other function here uses.
    log.crawler.info("shunqi_list task from redis: %s" % item)
    total = None
    if item:
        url = item
        # Cached value may be the repr of a task dict rather than a bare URL.
        item = ast.literal_eval(item)
        if isinstance(item, dict):
            url = item['url']
            city = item['city']
            prov = item['prov']
        log.crawler.info("start crawler prov:%s,city:%s,url is:%s" % (prov, city, url))
        module = k
        kwargs = dict(module=module, data={
            "url": url,
            "city": city,
            "prov": prov
        })
        try:
            total = start(**kwargs)
            log.crawler.info("total num is:%s" % total)
        except Exception:
            # Push the task back to the key it came from for retry.
            log.crawler.info("顺企网首页请求发生异常将参数缓存到原来的key")
            hr.put_str_into_redis(k, item)
        if total:
            if not isinstance(total, int):
                total = int(total)
            for i in range(2, total + 1):
                try:
                    log.crawler.info("start crawler url is:%s,page is:%s" % (url, i))
                    kwargs = dict(module=module, data={
                        "url": url,
                        "city": city,
                        'page': i,
                        "prov": prov
                    })
                    start(**kwargs)
                except Exception:
                    log.crawler.info("发生异常将URL缓存到redis......")
                    # BUGFIX: the old code did `url = url.format(page=i)`,
                    # mutating the URL template so every later page of this
                    # loop (and its error record) used an already-formatted
                    # URL. Format into a separate local instead.
                    failed_url = url.format(page=i)
                    data = dict(city=city, url=failed_url, prov=prov)
                    hr.put_str_into_redis("shunqi_list_error", data)
def start_yingyongbao(**kwargs):
    """Crawl Yingyongbao listing pages for the first cached category,
    spawning one thread each to persist page details and comments.

    Stops after 2000 pages, or after 10 empty pages, whichever comes first.
    """
    hr = HandleRedis(1)
    type2 = hr.get_data_redis("yingyongbao_types")
    # Cached value looks like "b'xxx',..."; take the first entry and strip
    # the b'...' wrapper. TODO(review): confirm stored format at the producer.
    t = type2.split(',')[0][2:-1]
    empty_pages = 0
    for i in range(0, 2000):
        # BUGFIX(idiom): removed the no-op `try: ... except Exception as e:
        # raise e` wrapper and the pointless `for _ in range(1):` loop that
        # surrounded the thread starts — neither changed behavior.
        kwargs = dict(module="yingyongbao_data", data=dict(t=t, i=i))
        dicts = start(**kwargs)
        if dicts:
            log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(dicts)))
            details = dict(module="yingyongbao_save_details", data=dicts)
            comment = dict(module="yingyongbao_save_comment", data=dicts)
            # Persist details and comments concurrently.
            threading.Thread(target=start, kwargs=details).start()
            threading.Thread(target=start, kwargs=comment).start()
        else:
            empty_pages += 1
            if empty_pages == 10:
                break
def baidu_shixin(**kwargs):
    """Scheduled-task entry point: crawl Baidu 'shixin' (dishonest-debtor)
    search results for the next cached keyword.

    Enterprise and person hits are cached into Redis db 7. Pagination
    advances 10 results at a time until the crawler module reports
    ``"finished"`` or offset 2000 is reached. Any exception is logged and
    swallowed (best-effort scheduled job).
    """
    r = HandleRedis(1)
    name = r.get_data_redis("shixin_words")
    # Guard clause: nothing left to crawl. (The old code had a no-op
    # `if name: name = name` branch and an unused `data = {}` local.)
    if not name:
        log.crawler.info('百度失信关键词遍历完毕.....')
        return
    try:
        pn = 0
        hr = HandleRedis(7)
        # The crawler replaces `pn` with the string "finished" at the end,
        # which also terminates this loop via the isinstance() condition.
        while isinstance(pn, int):
            kwargs = dict(module='baidu', data=dict(name=name, pn=pn))
            log.crawler.info("crawler name is:{},pn is:{}".format(name, pn))
            result_dict = start(**kwargs)
            qiye = result_dict['enterprise']
            person = result_dict['person']
            if qiye:
                hr.cache_list_redis('TB_SHIXIN_ENTERPRISE', qiye)
                log.crawler.info(
                    "cache qiye shixin into redis success length is:%s" % len(qiye))
            if person:
                hr.cache_list_redis('TB_SHIXIN_PERSON', person)
                log.crawler.info(
                    "cache person shixin into redis success length is:%s" % len(person))
            pn = result_dict["pn"]
            if pn == "finished":
                log.crawler.info("数据请求完毕name:{},pn:{}".format(name, pn))
                break
            elif pn == 2000:
                break
            else:
                pn += 10
    except Exception as err:
        log.error.info('百度失信爬虫发生异常,信息为:\n%s' % err)
def start_wenshu_peichang(**kwargs):
    """Crawl wenshu (court judgement) compensation documents for the next
    cached keyword, page by page, fetching each document's detail page in
    a bounded pool of daemon threads (at most 10 in flight).
    """
    r = HandleRedis(1)
    name = r.get_data_redis("wenshu_keys")
    if not name:
        log.crawler.info('裁判文书关键词遍历完毕.....')
        return
    # BUGFIX: stray debug print(name) replaced with the project logger.
    log.crawler.info("crawler wenshu key is:%s" % name)
    index = 1
    while True:
        log.crawler.info("*" * 80)
        log.crawler.info("start crawler wenshu page is:%d" % index)
        kwargs = dict(module="wenshu_peichang_data",
                      data=dict(key=name, index=index), proxies=True)
        items = start(**kwargs)
        # items[0] is metadata (RunEval token); the documents start at [1].
        log.crawler.info("获取的文件ID长度为:%d" % (len(items) - 1))
        if len(items) == 1:
            break
        if items:
            run_eval = items[0]['RunEval']
        else:
            break
        threads = []
        for item in items[1:]:
            # Throttle: prune finished threads and wait — without skipping
            # the current item — while 10 are still running.
            # BUGFIX: the old code `continue`d past the item when throttled
            # (dropping it) and removed entries from `threads` while
            # iterating over that same list.
            while True:
                threads = [t1 for t1 in threads if t1.is_alive()]
                if len(threads) < 10:
                    break
                time.sleep(3)
            data = {}
            data["docid"] = item["文书ID"]
            data["CASE_TYPE"] = item["案件类型"]
            data["CASE_TIME"] = item["裁判日期"]
            data["CASE_NAME"] = item["案件名称"]
            data["CASE_NUM"] = item["案号"]
            data["COURT_NAME"] = item["法院名称"]
            data['runeval'] = run_eval
            d = dict(module="wenshu_peichang_detail", data=data, proxies=True)
            t = threading.Thread(target=start, kwargs=d)
            t.daemon = True  # setDaemon() is deprecated
            t.start()
            # BUGFIX: the old code joined each thread immediately after
            # start() (making the crawl fully sequential) and never appended
            # it to `threads`, so the 10-thread throttle above and the final
            # join below were dead code.
            threads.append(t)
        # Wait for the last in-flight detail fetches before the next page.
        for t in threads:
            t.join()
        index += 1
def start_sougou(**kwargs):
    """Crawl Sogou content pages for the first cached category, spawning a
    thread per non-empty page to persist the data.

    Stops after 20000 pages, or after 10 empty pages, whichever comes first.
    """
    hr = HandleRedis(1)
    # Cached value looks like "b'xxx',..."; take the first entry and strip
    # the b'...' wrapper. TODO(review): confirm stored format at the producer.
    type2 = hr.get_data_redis("sougou_type").split(',')[0][2:-1]
    empty_pages = 0
    for i in range(0, 20000):
        try:
            kwargs = dict(module="sougou_content", data=dict(type1=type2, i=i))
            content = start(**kwargs)
            if content:
                # BUGFIX: log the length of the fetched content, not of the
                # single-key wrapper dict {"content": ...} (always 1).
                log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(content)))
                details = dict(module="sougou_save_data", data={"content": content})
                # (Removed the pointless `for _ in range(1):` wrapper loop.)
                threading.Thread(target=start, kwargs=details).start()
            else:
                empty_pages += 1
                if empty_pages == 10:
                    break
        except Exception as e:
            log.crawler.error(e)