def get_mobile_phone(request):
    """Return mobile-phone products updated after a cutoff time as JSON.

    The optional ``update_time`` GET parameter (format
    ``%Y-%m-%d %H:%M:%S.%f``) sets the cutoff; when absent, records from
    the last 24 hours are returned. On any failure the exception is
    logged and an HTTP 400 is returned instead.
    """
    conn = MongoDB(MONGO_MOBILEBRAND_DB, MONGO_MOBILEBRAND_COLLECTIONS)
    try:
        raw_cutoff = request.GET.get("update_time")
        if raw_cutoff:
            cutoff = datetime.strptime(raw_cutoff, "%Y-%m-%d %H:%M:%S.%f")
        else:
            cutoff = datetime.now() - relativedelta(days=1)
        wanted_fields = {"product_name": 1, "for_sale": 1, "brand_name": 1,
                         "product_price": 1, "update_time": 1}
        records = []
        cursor = conn.getAll(filter={"update_time": {"$gt": cutoff}},
                             fields=wanted_fields)
        for record in cursor:
            # ObjectId is not JSON-serializable; expose it as a hex-style string.
            record["_id"] = "0x" + str(record["_id"])
            records.append(record)
    except Exception:
        logger.exception("get_mobile_phone")
        return HttpResponseBadRequest("Bad Request!")
    else:
        return JsonResponse(records, safe=False)
    finally:
        conn.close()
def generate_query(self):
    """
    Generate query conditions and push them into the query queue.

    For each court, the [start_date, end_date] range is split into windows
    small enough that each window yields at most 220 results; qualifying
    windows are persisted to mongo and pushed onto the crawl queue.
    :return:
    """
    self.logger.info("query condition init begin!")
    # Load the court list — each court is one query dimension.
    mongo_instance = MongoDB(MONGO_WENSHU_DB, MONGO_CHINACOURT_COLLECTIONS)
    # no_cursor_timeout: the court scan may outlive the default cursor TTL.
    cursor = mongo_instance.getAll(fields={
        "_id": 1,
        "name": 1
    },
                                   sort=[("province", MONGO_ASCENDING)],
                                   no_cursor_timeout=True)
    court_list = [court["name"] for court in cursor]
    # Case types, used for a finer split when a 1-day window still overflows.
    case_type_list = ["1", "2", "3", "4", "5"]
    for court in court_list:
        count = 1
        avg_interval = 0
        # When the hit count is very large, total/220 is used directly as
        # the interval (in days) instead of repeated range halving.
        avg_interval_first = 0
        start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(self.end_date, "%Y-%m-%d")
        while True:
            # NOTE(review): at count == 2 the divisor is forced to 2 rather
            # than count**2 == 4 — presumably deliberate tuning; confirm.
            divisor = (count**2) if count != 2 else 2
            # Average window length in days for the next probe.
            interval_day = avg_interval if avg_interval > 0 else ceil(
                (end_date - start_date).days / divisor)
            if avg_interval_first > 0:
                avg_interval = avg_interval_first
                avg_interval_first = 0
            self.logger.info("interval_day->%s" % interval_day)
            # Build the query date range, clamped to end_date.
            end_date_temp = min(start_date + timedelta(days=interval_day),
                                end_date)
            query_date = "%s TO %s" % (start_date.strftime("%Y-%m-%d"),
                                       end_date_temp.strftime("%Y-%m-%d"))
            self.logger.info("query_date->%s!" % query_date)
            query_condition = dict()
            query_condition["case_type"] = "0"  # "0" means all case types
            query_condition["court"] = court
            query_condition["date"] = query_date
            if self.is_query_condition_exists(query_condition):
                # Already generated on a previous run: skip this window.
                if end_date == end_date_temp:
                    self.logger.info("%s query_condition exists!break!" %
                                     court)
                    break
                else:
                    start_date = end_date_temp + self.one_day
                    self.logger.info(
                        "%s query_condition exists!continue!"
                        % json_dumps(query_condition))
                    continue
            # Windows with <= 220 hits are final: record them and (when
            # non-empty) push them onto the crawl queue.
            query_count = self.get_count_by_condition(court=court,
                                                      date=query_date)
            if 0 <= query_count <= 220:
                if query_count > 0:
                    self.record_query_condition(query_condition)
                    self.push_query_condition_queue(query_condition)
                # Zero hits: persist to mongo only, with status -1.
                if query_count == 0:
                    self.record_query_condition(query_condition, -1)
                if end_date == end_date_temp:
                    if count > 1:
                        # Also record the court's full polled date range in
                        # mongodb with status -1 (completion marker).
                        init_date = "%s TO %s" % (self.start_date,
                                                  self.end_date)
                        query_condition["date"] = init_date
                        self.record_query_condition(query_condition, -1)
                    self.logger.info("%s query condition end!" % court)
                    break
                else:
                    start_date = end_date_temp + self.one_day
            else:
                # Too many hits: shrink the window for the next iteration.
                if count > 1:
                    avg_interval_first = avg_interval
                temp_days = (end_date_temp - start_date).days
                try:
                    avg_interval = int(180 / (int(query_count) / temp_days))
                except ZeroDivisionError:
                    self.logger.exception("爬取出错,出错原因:")
                    break
                # If even a 1-day window returns more than 220 results,
                # split the saved conditions further by case type.
                if temp_days == 1:
                    for case_type in case_type_list:
                        query_condition["case_type"] = case_type
                        if not self.is_query_condition_exists(
                                query_condition):
                            self.record_query_condition(query_condition)
                            self.push_query_condition_queue(
                                query_condition)
                    if end_date == end_date_temp:
                        if count > 1:
                            # Also record the court's full polled date
                            # range in mongodb with status -1.
                            init_date = "%s TO %s" % (self.start_date,
                                                      self.end_date)
                            query_condition["date"] = init_date
                            self.record_query_condition(
                                query_condition, -1)
                        self.logger.info("%s query condition end!" % court)
                        break
                    else:
                        start_date = end_date_temp + self.one_day
            count += 1
    self.logger.info("query condition init end!")
class WenshuSpider(NoticeClosedSpider):
    """Base spider for the wenshu (court judgement) crawlers.

    Bundles the shared plumbing: an SSDB queue of serialized query
    conditions, a MongoDB record of which conditions were generated or
    finished, proxy rotation on failure, and restart bookkeeping when
    the spider closes.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ssdb_conn = get_ssdb_conn()
        self.mongo_instance = MongoDB(MONGO_WENSHU_DB,
                                      MONGO_WENSHU_CONDITION_COLLECTIONS)
        self.proxy_api = ProxyApi()
        self.proxy = self.proxy_api.get_proxy_one()
        self.pid = getpid()
        self.lock = Lock()
        self.logger.info("init pid->%d" % self.pid)

    def is_query_condition_exists(self, condition):
        """Return True if *condition* was already recorded in mongo.

        The dict is serialized via dict_sorted() + json so the stored key
        is order-independent. Any lookup error counts as "not found"
        (best effort).
        """
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            result = self.mongo_instance.getOne(
                filter={"condition": condition},
                fields={
                    "condition": 1,
                    "status": 1,
                    "_id": 0
                })
            if result:
                return True
        except Exception:
            pass
        return False

    def record_query_condition(self, condition, status=0):
        """Persist *condition* to mongo with the given status (best effort)."""
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            item = {
                "condition": condition,
                "status": status,
            }
            self.mongo_instance.insertOne(item)
        except Exception:
            return

    def push_query_condition_queue(self, condition):
        """Serialize *condition* and enqueue it on the SSDB queue (best effort)."""
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            return

    def clear_query_condition(self):
        """Wipe both the mongo record and the SSDB queue (best effort)."""
        try:
            self.mongo_instance.deleteMany(filter={})
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            return

    def get_wenshu_condition(self):
        """Pop the next serialized condition from SSDB; {} on failure."""
        try:
            return self.ssdb_conn.qpop_front(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            pass
        return {}

    def push_wenshu_condition(self, condition):
        """Re-enqueue an already-serialized condition (best effort)."""
        try:
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            return

    def is_wenshu_id_exists(self, file_id):
        """Return True if *file_id* was seen before.

        Errs on the side of True so that a broken SSDB connection never
        causes duplicate crawling.
        """
        try:
            return self.ssdb_conn.hexists(SSDB_WENSHU_ID_HSET, file_id)
        except Exception:
            return True

    def record_wenshu_id_error(self, file_id):
        """Remember *file_id* in the error hset (best effort)."""
        try:
            self.ssdb_conn.hset(SSDB_WENSHU_ID_ERROR_HSET, file_id, "")
        except Exception:
            return

    def reset_wenshu_condition(self):
        """Rebuild the SSDB queue from every mongo condition with status 0."""
        try:
            # Empty the queue first.
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
            # Re-enqueue every recorded condition whose status is still 0.
            cursor = self.mongo_instance.getAll(filter={"status": 0},
                                                fields={
                                                    "condition": 1,
                                                    "status": 1,
                                                    "_id": 0
                                                })
            for item in cursor:
                self.push_query_condition_queue(item["condition"])
        except Exception:
            pass
        return

    def exception_handle(self, condition, error_info):
        """Recover from a failed request: re-queue *condition*, switch
        proxy, and yield a fresh retry Request.

        NOTE(review): this is a generator (it yields); exception_response
        and err_callback call it without iterating the result, so none of
        this body actually runs on those paths — confirm intent.
        """
        try:
            if self.name != "condition_spider":
                # script_name = "start_pc.sh" if self.name == "wenshu_pc_spider" else "start_app.sh"
                # On any error, push the failing condition back onto the queue.
                self.push_wenshu_condition(condition)
            self.logger.info("parse or parse_doc error->%s" % str(error_info))
            # An empty body or an "rtn"-style payload means the server has
            # blocked us; pause briefly, then retry with a new proxy.
            self.logger.info("sleep start!")
            sleep(5)  # pause for 5 seconds
            self.logger.info("sleep end!")
            # Rotate to a fresh proxy.
            self.proxy = self.proxy_api.get_proxy_one()
            self.logger.error("request retry")
            # Re-issue the request for the current condition.
            request = Request(url=self.list_url,
                              method='POST',
                              callback=self.parse,
                              body=json_dumps(self.req_data),
                              headers=self.headers,
                              dont_filter=True,
                              errback=self.err_callback)
            self.set_proxy(request)
            yield request
        except Exception:
            self.exception_handle(condition, "change proxy error!")
            # os_system("kill -9 %d" % pid)
            # os_system("kill -9 %d && nohup /opt/test_wenshu/crawler/crawler_bqjr/%s >/dev/null 2>&1 &" % (pid, script_name))
            # def start_wenshu_crawler(spider):
            #     self.logger.info("begin new process")
            #     process = CrawlerProcess(get_project_settings())
            #     process.crawl(spider)
            #     process.start()
            # p = Process(target=start_wenshu_crawler, args=(self,))
            # p.start()
            # # Get the pid, kill the process, then restart the spider via nohup.
            # pid = getpid()
            # self.logger.info("kill pid->%d" % pid)
            # kill(pid, 9)

    def exception_response(self, condition, response):
        """Treat non-200 responses and known block pages as failures."""
        if response.status != 200 \
                or "/Html_Pages/VisitRemind.html" in response.text \
                or response.text == "atZtw/muLK3OdYWrljShpg==":
            # Record the failure so the condition can be re-crawled or
            # inspected later.
            self.exception_handle(condition,
                                  "status code:" + str(response.status))

    def dict_sorted(self, data):
        """Return the dict's items sorted by key length (descending) so
        the serialized form is deterministic."""
        return sorted(data.items(), key=lambda t: len(t[0]), reverse=True)

    def closed(self, reason):
        """On spider close, track close count in count.txt; after 20
        closes, restart the whole crawler via nohup, otherwise just kill
        this pid.

        NOTE(review): lock.release() is not in a finally clause; the
        broad except keeps it reachable, but "with self.lock:" would be
        safer — left unchanged.
        """
        if self.name != "condition_spider":
            self.lock.acquire()
            try:
                msg = super().closed(reason)
                self.logger.error("spider closed, pid->%d, reason->%s" %
                                  (self.pid, msg))
                with open("count.txt", "r") as f:
                    count = int(f.read())
                count += 1
                if count >= 20:
                    with open("pid.txt", "r") as f:
                        parent_pid = int(f.read())
                    self.logger.error(
                        "kill pid->%d, parent pid->%d, restart now!" %
                        (self.pid, parent_pid))
                    os_system(
                        "nohup /opt/test_wenshu/crawler/crawler_bqjr/start_app.sh "
                        ">/dev/null 2>&1 &")
                    os_system("kill -9 %d" % self.pid)
                else:
                    with open("count.txt", "w") as f:
                        f.write(str(count))
                    self.logger.error("kill pid->%d" % self.pid)
                    os_system("kill -9 %d" % self.pid)
            except Exception:
                self.logger.error("kill pid or restart error!")
            self.lock.release()

    def err_callback(self, failure):
        """Scrapy errback: log the failure and hand off to exception_handle."""
        self.logger.error("request error->%s" % repr(failure))
        if self.name in ["wenshu_pc_spider", "wenshu_app_spider"]:
            self.exception_handle(self.condition,
                                  "request failure, change proxy!")
class ProxyApi(object):
    """Lookup helper for healthy proxies stored in MongoDB, with the paid
    kuaidaili API as an explicit fallback source."""

    def __init__(self):
        # Thread-safe MongoDB wrapper with connection pooling.
        self.mongo_instance = MongoDB(MONGO_PROXY_DB, MONGO_PROXY_COLLECTIONS)
        # Per-scheme filter applied to the proxy's scheme_type field.
        self.scheme_filter_dict = {
            SchemeType.HTTP: {"$ne": SchemeType.HTTPS},
            SchemeType.HTTPS: {"$ne": SchemeType.HTTP},
            SchemeType.HTTP_OR_HTTPS: {"$eq": SchemeType.HTTP_OR_HTTPS},
        }
        # Per-method filter applied to the proxy's support_method field.
        self.method_filter_dict = {
            SupportMethod.GET: {"$ne": SupportMethod.POST},
            SupportMethod.POST: {"$ne": SupportMethod.GET},
            SupportMethod.GET_OR_POST: {"$eq": SupportMethod.GET_OR_POST},
        }
        # A proxy counts as good when it passed every check URL for its scheme.
        self.good_quality_dict = {
            SchemeType.HTTP: {"$gte": len(HTTP_CHECK_URL_LIST)},
            SchemeType.HTTPS: {"$gte": len(HTTPS_CHECK_URL_LIST)},
            SchemeType.HTTP_OR_HTTPS: {"$gte": len(HTTPS_CHECK_URL_LIST)},
        }
        # Acceptable response-time window per scheme.
        self.good_response_time_dict = {
            SchemeType.HTTP: {"$lt": 1, "$gte": 0},
            SchemeType.HTTPS: {"$lt": 3, "$gte": 0},
            SchemeType.HTTP_OR_HTTPS: {"$lt": 1, "$gte": 0},
        }

    def close(self):
        """Release the underlying MongoDB connection."""
        self.mongo_instance.close()

    def get_proxy_from_kuaidaili(self, stable_time=StableTime.MIN_10):
        """Buy up to 50 proxies from the kuaidaili API.

        Returns a list of "ip:port" strings; raises NoProxyException on
        any HTTP or network failure.
        """
        try:
            api_url = ("http://dps.kuaidaili.com/api/getdps/?"
                       "orderid=959308673589451&num=50&sep=2&ut=" +
                       str(stable_time))
            response = http_get(api_url)
            if response.status_code != 200:
                raise NoProxyException
            return response.text.split()
        except Exception:
            from traceback import print_exc
            print_exc()
            raise NoProxyException

    def get_proxy_all(self, location=None, anonymous=AnonymousLevel.MIDDLE,
                      scheme=SchemeType.HTTP, method=SupportMethod.GET):
        """List every healthy proxy for *scheme* as "ip:port", fastest first.

        *location*, *anonymous* and *method* are accepted for interface
        compatibility but currently unused (their filters are disabled).
        """
        query = {
            "quality": self.good_quality_dict[scheme],
            "response_time": self.good_response_time_dict[scheme],
            # "anonymous_level": {"$lte": anonymous},
            "scheme_type": self.scheme_filter_dict[scheme],
            # "support_method": self.method_filter_dict[method],
        }
        # if location:
        #     query["location"] = re_compile(".*" + location + ".*")
        docs = self.mongo_instance.getAll(
            filter=query,
            fields={"ip": 1, "port": 1, "_id": 0},
            sort=[("response_time", MONGO_ASCENDING)])
        return ["%s:%s" % (doc["ip"], doc["port"]) for doc in docs]

    def get_proxy_one(self, location=None, anonymous=AnonymousLevel.MIDDLE,
                      scheme=SchemeType.HTTP, method=SupportMethod.GET,
                      stable_time=StableTime.MIN_10):
        """Return one random healthy proxy; raise NoProxyException if none."""
        candidates = self.get_proxy_all(location, anonymous, scheme, method)
        if not candidates:
            raise NoProxyException
        return rand_choice(candidates)