def __init__(self, *args, **kwargs):
    """Set up storage connections, a rotating proxy and per-process state."""
    super().__init__(*args, **kwargs)
    # Process identity and the lock used during spider shutdown handling.
    self.pid = getpid()
    self.lock = Lock()
    # Storage backends: SSDB for queues/hsets, Mongo for query conditions.
    self.ssdb_conn = get_ssdb_conn()
    self.mongo_instance = MongoDB(MONGO_WENSHU_DB,
                                  MONGO_WENSHU_CONDITION_COLLECTIONS)
    # Outgoing requests go through a proxy picked from the pool.
    self.proxy_api = ProxyApi()
    self.proxy = self.proxy_api.get_proxy_one()
    self.logger.info("init pid->%d" % self.pid)
def get_search_good_word_request(self):
    """Yield search requests for candidate person names.

    Names come from three generators, in order:
      1. every two-character permutation of the common characters,
      2. common-surname + good-name combinations (and the reverse pairing
         with non-surname characters),
      3. names read from the court-published shixin list and the
         enforcement (zhixing) list in Mongo.
    ``filter_set`` tracks every name already emitted so that the
    DB-sourced names in step 3 are not searched twice.
    """
    _get_search_request = self._get_search_request
    # Seed the dedup set with two-character strings that are NOT person
    # names: city names and two-character surnames.
    filter_set = set(w for w in self.citys if len(w) == 2)
    filter_set.update(w for w in self.first_names if len(w) == 2)
    # Two-character permutations of the common characters.
    good_words = self.good_words.copy()
    for word1, word2 in product(good_words, good_words):
        name = word1 + word2
        if name not in filter_set:
            filter_set.add(name)
            yield _get_search_request(name, 0)
    if self.good_names:
        # Surnames that are also common characters, followed by good names.
        good_first_name = set(self.first_names) & set(self.good_words)
        good_names = self.good_names.copy()
        for word1, word2 in product(good_first_name, good_names):
            name = word1 + word2
            filter_set.add(name)
            yield _get_search_request(name, 0)
        # Good names followed by common characters that are NOT surnames.
        good_name_word = set(self.good_words) - set(self.first_names)
        for word1, word2 in product(good_names, good_name_word):
            name = word1 + word2
            filter_set.add(name)
            yield _get_search_request(name, 0)
    # Court-published (shixin) name list; only short (person-like) names.
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_LIST_COLLECTIONS) as mongo_instance:
        name_set = {
            i["name"]
            for i in mongo_instance.getAll(fields={"name": 1, "_id": 0})
            if len(i["name"]) < 5
        }
    # List of persons subject to enforcement (zhixing).
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        name_set.update(
            i["name"]
            for i in mongo_instance.getAll(fields={"name": 1, "_id": 0})
            if len(i["name"]) < 5)
    # Skip anything already generated above.
    name_set -= filter_set
    self.logger.info("name_set length: %d" % len(name_set))
    for name in name_set:
        yield _get_search_request(name, 0)
def __init__(self):
    """Create the Mongo handle and the canned filters used by proxy queries."""
    # The MongoDB client is thread-safe and pooled, so one shared handle
    # serves all callers.
    self.mongo_instance = MongoDB(MONGO_PROXY_DB, MONGO_PROXY_COLLECTIONS)
    # A proxy recorded for one scheme only must be excluded when the other
    # scheme is requested; dual-scheme proxies are matched exactly.
    self.scheme_filter_dict = {
        SchemeType.HTTP: {"$ne": SchemeType.HTTPS},
        SchemeType.HTTPS: {"$ne": SchemeType.HTTP},
        SchemeType.HTTP_OR_HTTPS: {"$eq": SchemeType.HTTP_OR_HTTPS},
    }
    # Same exclusion logic for the supported HTTP method.
    self.method_filter_dict = {
        SupportMethod.GET: {"$ne": SupportMethod.POST},
        SupportMethod.POST: {"$ne": SupportMethod.GET},
        SupportMethod.GET_OR_POST: {"$eq": SupportMethod.GET_OR_POST},
    }
    # "Good" proxies passed every check URL configured for their scheme...
    self.good_quality_dict = {
        scheme: {"$gte": required}
        for scheme, required in (
            (SchemeType.HTTP, len(HTTP_CHECK_URL_LIST)),
            (SchemeType.HTTPS, len(HTTPS_CHECK_URL_LIST)),
            (SchemeType.HTTP_OR_HTTPS, len(HTTPS_CHECK_URL_LIST)),
        )
    }
    # ...and responded within the per-scheme time budget (seconds).
    self.good_response_time_dict = {
        scheme: {"$lt": limit, "$gte": 0}
        for scheme, limit in (
            (SchemeType.HTTP, 1),
            (SchemeType.HTTPS, 3),
            (SchemeType.HTTP_OR_HTTPS, 1),
        )
    }
def get_search_request(self):
    """Endlessly yield POST search requests for companies pulled off the queue."""
    ssdb_conn = get_ssdb_conn()
    mongo_instance = MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS)
    add_proxy = self._add_proxy
    search_callback = self.parse_search
    seen_names = self.name_set
    while True:
        company = get_one_company(mongo_instance, ssdb_conn)
        if company is None:
            # Queue drained: emit a placeholder request so Scrapy stays alive.
            yield Request(DO_NOTHING_URL, self.do_nothing,
                          errback=self.do_nothing, dont_filter=True)
            continue
        company_name = company["name"]
        if company_name in seen_names:
            continue
        request = FormRequest(
            "http://app03.szmqs.gov.cn/xyjggs.webui/xyjggs/Ajax/Ajax.ashx",
            search_callback, dont_filter=True,
            formdata={
                "action": "getSSDJBList",
                "keyword": company_name,
                "PageIndex": "1",
            })
        request.meta["company_other_info"] = company
        add_proxy(request)
        yield request
def del_duplicate_zhixing():
    """Remove zhixing documents with a duplicated link_id, keeping the newest.

    The newest-first scan (descending _id) means the first occurrence of
    each link_id wins and later (older) ones are deleted.
    """
    seen_codes = set()
    stale_ids = []
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={"link_id": 1},
                                          sort=[("_id", MONGO_DESCENDING)]):
            try:
                code = item["link_id"]
                if code in seen_codes:
                    stale_ids.append(item["_id"])
                else:
                    seen_codes.add(code)
            except Exception:
                print_exc()
        del seen_codes
        for stale_id in stale_ids:
            mongo_instance.deleteOne(filter={"_id": stale_id})
        # Refresh the SSDB mirror of zhixing ids after the purge.
        count_zhixing_id(mongo_instance)
    print("Del %d of duplicated item in collection[%s]" %
          (len(stale_ids), MONGO_ZHIXING_DETAIL_COLLECTIONS))
    del stale_ids
def check_proxy_usable():
    """Endlessly re-validate the proxies stored in Mongo.

    Duplicate proxy records are purged once up front; after that the
    spider proxy collection is re-checked in an infinite loop. Any error
    is printed and the loop continues — this function never returns.

    (A large block of dead, commented-out kuaidaili-import code was
    removed; see VCS history if that path is ever needed again.)
    """
    del_duplicate_proxy(MONGO_PROXY_COLLECTIONS)
    mongo_instance_spider = MongoDB(MONGO_PROXY_DB, MONGO_PROXY_COLLECTIONS)
    while True:
        try:
            _check_proxy_usable(mongo_instance_spider)
        except Exception:
            print_exc()
def del_duplicate_shixinlist():
    """Drop shixin-list documents whose (name + id) key repeats, keeping the newest."""
    seen_keys = set()
    stale_ids = []
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_LIST_COLLECTIONS) as mongo_instance:
        # Newest-first scan: the first occurrence of a key wins.
        for item in mongo_instance.getAll(fields={"id": 1, "name": 1},
                                          sort=[("_id", MONGO_DESCENDING)]):
            try:
                key = item["name"] + item["id"]
                if key in seen_keys:
                    stale_ids.append(item["_id"])
                else:
                    seen_keys.add(key)
            except Exception:
                print_exc()
        del seen_keys
        for stale_id in stale_ids:
            mongo_instance.deleteOne(filter={"_id": stale_id})
    print("Del %d of duplicated item in collection[%s]" %
          (len(stale_ids), MONGO_SHIXIN_LIST_COLLECTIONS))
    del stale_ids
def __init__(self, *args, **kwargs):
    """Load already-crawled company names and the busy-page marker string."""
    super().__init__(*args, **kwargs)
    # Substring the target site returns when it throttles the crawler.
    self.WEBSITE_BUSY_STR = "过于频繁,请稍"
    # Names already present in the detail collection are skipped later.
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL_COLLECTIONS) as mongo_instance:
        self.name_set = {
            doc["name"]
            for doc in mongo_instance.getAll(fields={"name": 1, "_id": 0})
        }
def push_all_company_id():
    """Refill the SSDB company queue with Shenzhen companies not yet crawled."""
    # Names present in either detail collection are considered finished.
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL_COLLECTIONS) as mongo_instance:
        finished = {
            doc["name"]
            for doc in mongo_instance.getAll(fields={"name": 1, "_id": 0})
        }
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL2_COLLECTIONS) as mongo_instance:
        finished.update(
            doc["name"]
            for doc in mongo_instance.getAll(fields={"name": 1, "_id": 0}))
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.qclear(SSDB_COMPANY_QUEUE_NAME)
    # Companies tagged as Shenzhen, or whose name mentions 深圳.
    shenzhen_filter = {"$or": [{"area": "shenzhen"},
                               {"name": re_compile(r".*深圳.*")}]}
    with MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS) as mongo_instance:
        for doc in mongo_instance.getAll(fields={"name": 1, "_id": 1},
                                         filter=shenzhen_filter,
                                         sort=[("_id", MONGO_DESCENDING)]):
            if doc["name"] not in finished:
                ssdb_conn.qpush_back(SSDB_COMPANY_QUEUE_NAME,
                                     str(doc["_id"]))
    ssdb_conn.close()
    del finished
    print("push_all_company_id done.")
def get_mobile_phone(request):
    """Django view: return phones updated after the ``update_time`` GET param.

    Without the parameter, the window defaults to the last 24 hours.
    Any failure (bad date format, DB error) yields a 400 response.
    """
    mongo_instance = MongoDB(MONGO_MOBILEBRAND_DB,
                             MONGO_MOBILEBRAND_COLLECTIONS)
    try:
        since_raw = request.GET.get("update_time")
        since = (datetime.strptime(since_raw, "%Y-%m-%d %H:%M:%S.%f")
                 if since_raw
                 else datetime.now() - relativedelta(days=1))
        phones = []
        cursor = mongo_instance.getAll(
            filter={"update_time": {"$gt": since}},
            fields={"product_name": 1, "for_sale": 1, "brand_name": 1,
                    "product_price": 1, "update_time": 1})
        for doc in cursor:
            # ObjectId is not JSON-serializable; ship it as a hex-ish string.
            doc["_id"] = "0x" + str(doc["_id"])
            phones.append(doc)
    except Exception:
        logger.exception("get_mobile_phone")
        return HttpResponseBadRequest("Bad Request!")
    else:
        return JsonResponse(phones, safe=False)
    finally:
        mongo_instance.close()
def record_all_zhixing_id():
    """Rebuild the SSDB hset mirroring every zhixing ``link_id`` in Mongo."""
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_ZHIXING_ID_HSET_NAME)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        for doc in mongo_instance.getAll(fields={"link_id": 1, "_id": 0}):
            try:
                ssdb_conn.hset(SSDB_ZHIXING_ID_HSET_NAME, doc["link_id"], "")
            except Exception:
                print_exc()
    ssdb_conn.close()
    print("record_all_zhixing_id done.")
def push_all_tianyancha_company_id():
    """Refill the tianyancha SSDB queue with ids of companies indexed there."""
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.qclear(SSDB_TIANYANCHA_QUEUE_NAME)
    # Only companies whose recorded search_url points at tianyancha.com.
    tianyancha_filter = {
        "search_url": re_compile(r"^http://www\.tianyancha\.com/company/")
    }
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL2_COLLECTIONS) as mongo_instance:
        cursor = mongo_instance.getAll(fields={"_id": 1},
                                       filter=tianyancha_filter,
                                       sort=[("_id", MONGO_DESCENDING)])
        for doc in cursor:
            ssdb_conn.qpush_back(SSDB_TIANYANCHA_QUEUE_NAME, str(doc["_id"]))
    ssdb_conn.close()
    print("push_all_tianyancha_company_id done.")
def record_all_company_name():
    """Mirror every crawled (short) company name into an SSDB hset for dedup."""
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_COMPANY_HSET_NAME)
    with MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS) as mongo_instance:
        for doc in mongo_instance.getAll(fields={"name": 1, "_id": 0}):
            try:
                # Record crawled names so they are not crawled again;
                # overly long names are skipped.
                company_name = doc["name"]
                if len(company_name) < 60:
                    ssdb_conn.hset(SSDB_COMPANY_HSET_NAME, company_name, "")
            except Exception:
                print_exc()
    ssdb_conn.close()
    print("record_all_company_name done.")
def _del_duplicate_company(collections=MONGO_COMPANY_DETAIL_COLLECTIONS):
    """Delete company documents with a repeated name, keeping the newest copy."""
    seen_names = set()
    stale_ids = []
    with MongoDB(MONGO_COMPANY_DB, collections) as mongo_instance:
        # Newest-first scan: the first occurrence of each name is kept.
        for doc in mongo_instance.getAll(fields={"name": 1},
                                         sort=[("_id", MONGO_DESCENDING)]):
            company_name = doc["name"]
            if company_name in seen_names:
                stale_ids.append(doc["_id"])
            else:
                seen_names.add(company_name)
        for stale_id in stale_ids:
            mongo_instance.deleteOne(filter={"_id": stale_id})
    del seen_names
    print("Del %d of duplicated item in collection[%s]" %
          (len(stale_ids), collections))
def record_all_shixin_id():
    """Rebuild the SSDB hset of known shixin ids ("<from_web>_<link_id>")."""
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_SHIXIN_ID_HSET_NAME)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_DETAIL_COLLECTIONS) as mongo_instance:
        for doc in mongo_instance.getAll(fields={"from_web": 1,
                                                 "link_id": 1,
                                                 "_id": 0}):
            try:
                key = doc["from_web"] + "_" + str(doc["link_id"])
                ssdb_conn.hset(SSDB_SHIXIN_ID_HSET_NAME, key, "")
            except Exception:
                print_exc()
    ssdb_conn.close()
    print("record_all_shixin_id done.")
def record_all_shixinlist_id():
    """Rebuild the SSDB hset of known shixin-list ids ("<name><id>").

    Mirrors every (name + id) key from the shixin-list Mongo collection
    into SSDB so spiders can cheaply test whether a record is known.
    """
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_SHIXIN_LIST_ID_HSET_NAME)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_LIST_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={"id": 1,
                                                  "name": 1,
                                                  "_id": 0}):
            try:
                the_id = item["name"] + item["id"]
                ssdb_conn.hset(SSDB_SHIXIN_LIST_ID_HSET_NAME, the_id, "")
            except Exception:
                print_exc()
    ssdb_conn.close()
    # Fix: message previously said "record_all_shixin_list_id", which does
    # not match this function's name and broke log grepping.
    print("record_all_shixinlist_id done.")
def del_duplicate_proxy(collection):
    """Delete proxy documents repeating an "ip:port" pair; return the unique keys."""
    unique_proxies = set()
    stale_ids = []
    with MongoDB(MONGO_PROXY_DB, collection) as mongo_instance:
        for doc in mongo_instance.getAll(fields={"ip": 1, "port": 1}):
            try:
                endpoint = doc["ip"] + ":" + str(doc["port"])
                if endpoint in unique_proxies:
                    stale_ids.append(doc["_id"])
                else:
                    unique_proxies.add(endpoint)
            except Exception:
                print_exc()
        for stale_id in stale_ids:
            mongo_instance.deleteOne(filter={"_id": stale_id})
    print("Del %d of duplicated item in collection[%s]" %
          (len(stale_ids), collection))
    del stale_ids
    return unique_proxies
def start_requests(self):
    """Endlessly yield 58.com business-info lookups for queued companies.

    NOTE(review): the original source was corrupted/redacted around the
    Request construction (a ``"******"`` residue). It has been
    reconstructed — consistent with the commented-out FormRequest
    alternative that posted ``userName`` — to pass the company name as
    the ``userName`` query parameter. Confirm against VCS history.
    """
    ssdb_conn = get_ssdb_conn()
    mongo_instance = MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS)
    parse_search = self.parse_search
    name_set = self.name_set
    while True:
        company = get_one_company(mongo_instance, ssdb_conn)
        if company is None:
            # Queue drained: keep the spider alive with a no-op request.
            yield Request(DO_NOTHING_URL, self.do_nothing,
                          errback=self.do_nothing, dont_filter=True)
            continue
        company_name = company["name"]
        if company_name in name_set:
            continue
        request = Request(
            "http://qy.58.com/ajax/getBusinessInfo?userName=" + company_name,
            parse_search, dont_filter=True)
        request.meta["company_other_info"] = company
        yield request
def __init__(self, item_class, mongo_db, mongo_collection):
    """Bind the pipeline to one item class and one Mongo collection."""
    # AES key used when encrypting sensitive fields before storage.
    self.key = b"zhegemiyaobeininadaoyemeiyouyong"
    self.item_class = item_class
    self.mongo_instance = MongoDB(mongo_db, mongo_collection)
class MongoPipelineUtils(object):
    """Shared helpers for Scrapy pipelines that persist items to MongoDB.

    Responsibilities: stripping blank characters from string fields,
    AES-encrypting password fields, off-loading raw HTML bodies to disk,
    recursively stringifying item values, and forwarding compressed items
    to RabbitMQ.
    """

    def __init__(self, item_class, mongo_db, mongo_collection):
        self.item_class = item_class
        self.mongo_instance = MongoDB(mongo_db, mongo_collection)
        # 32-byte AES key used by encrypt().
        self.key = b"zhegemiyaobeininadaoyemeiyouyong"

    def encrypt(self, text):
        """Return the AES-EAX-encrypted, base64-encoded form of ``text``.

        NOTE(review): the key doubles as the nonce, so equal plaintexts
        yield equal ciphertexts — cryptographically weak, but preserved
        for compatibility with data already stored this way.
        """
        key = self.key
        cryptor = AES_new(key, MODE_EAX, key)
        return b64encode(cryptor.encrypt(text.encode()))

    def strip_insert_item(self, item):
        """Return a dict copy of ``item`` with blank chars stripped from str values."""
        return {
            k: (v.strip(BLANK_CHARS) if isinstance(v, str) else v)
            for k, v in item.items()
        }

    def write_item_to_db(self, item):
        """Insert ``item`` into Mongo, off-loading any raw HTML to disk first.

        Returns the inserted document id as a string.
        """
        if "html" in item:
            # Store the page body on disk keyed by content hash; only the
            # hash is kept in the document.
            html = item.pop("html")
            html_hash = md5(html.encode()).hexdigest()
            filename = os_path.join(HTML_DIR, html_hash + ".html")
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)
            item["html_file"] = html_hash
        insert_dict = self.strip_insert_item(item)
        result = self.mongo_instance.insertOne(insert_dict)
        return str(result.inserted_id)

    def process_item(self, item, spider):
        """Scrapy pipeline hook: persist items of exactly ``self.item_class``.

        Foreign item types are passed through untouched; own items are
        written (passwords encrypted first) and then dropped so no later
        pipeline stage sees them. Write failures are logged, not raised.
        """
        item_class = self.item_class
        # Exact type match (not isinstance): subclasses are intentionally
        # left for their own dedicated pipeline.
        if type(item) is item_class:
            try:
                if "password" in item:
                    item["password"] = self.encrypt(item["password"])
                self.write_item_to_db(item)
            except Exception:
                spider.logger.exception("%s write item(%s) to db error: " %
                                        (spider.name, item))
            raise DropItem("Processing %s item done." % item_class.__name__)
        else:
            return item

    def all_data_2_string(self, data_dict):
        """Recursively convert every value of ``data_dict`` to a JSON-safe form.

        str and None pass through; dicts recurse; lists/tuples map the
        recursion over their elements; bytes are decoded; everything else
        becomes ``str(v)``.
        """
        new_dict = {}
        # BUG FIX: must iterate .items() — iterating the dict itself yields
        # only keys, so the ``k, v`` unpacking misbehaved (and the recursive
        # call on nested dicts could never work).
        for k, v in data_dict.items():
            if type(v) in [str, type(None)]:
                data = v
            elif isinstance(v, dict):
                data = self.all_data_2_string(v)
            elif type(v) in [list, tuple]:
                data = [self.all_data_2_string(i) for i in v]
            elif isinstance(v, bytes):
                data = v.decode()
            else:
                data = str(v)
            new_dict[k] = data
        return new_dict

    def rabbitmq_sender(self, queue, item_dict):
        """Gzip-compress + base64-encode the item (sans ``_id``) and publish it.

        :param queue: target queue name
        :param item_dict: document to send; its ``_id`` key is removed
        """
        del item_dict["_id"]
        content = b64encode(compress(
            json_dumps(item_dict).encode("utf-8"))).decode("utf-8")
        with RabbitmqSender(queue=queue, exchange=RABBITMQ_EXCHANGE,
                            durable=True) as rs:
            rs.send(content)
class WenshuSpider(NoticeClosedSpider):
    """Base spider for wenshu.court.gov.cn (court judgment documents).

    Query "conditions" (court + date range [+ case type]) are generated
    elsewhere, recorded in Mongo and queued in SSDB; this class provides
    the condition/id bookkeeping helpers plus the shared error-handling
    and self-restart logic used by the concrete PC/APP spiders.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ssdb_conn = get_ssdb_conn()
        self.mongo_instance = MongoDB(MONGO_WENSHU_DB,
                                      MONGO_WENSHU_CONDITION_COLLECTIONS)
        self.proxy_api = ProxyApi()
        self.proxy = self.proxy_api.get_proxy_one()
        self.pid = getpid()
        # Guards the kill/restart bookkeeping in closed().
        self.lock = Lock()
        self.logger.info("init pid->%d" % self.pid)

    def is_query_condition_exists(self, condition):
        """Return True if this (canonically serialized) condition is recorded.

        Errors are swallowed and treated as "not found".
        """
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            result = self.mongo_instance.getOne(
                filter={"condition": condition},
                fields={"condition": 1, "status": 1, "_id": 0})
            if result:
                return True
        except Exception:
            pass
        return False

    def record_query_condition(self, condition, status=0):
        """Persist the condition with ``status`` (0 appears to mean pending,
        -1 recorded-only/empty — confirm against generate_query). Best-effort."""
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            item = {
                "condition": condition,
                "status": status,
            }
            self.mongo_instance.insertOne(item)
        except Exception:
            return

    def push_query_condition_queue(self, condition):
        """Serialize the condition dict and push it onto the SSDB queue. Best-effort."""
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            return

    def clear_query_condition(self):
        """Wipe both the Mongo condition collection and the SSDB queue."""
        try:
            self.mongo_instance.deleteMany(filter={})
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            return

    def get_wenshu_condition(self):
        """Pop the next serialized condition from SSDB; {} on error/empty."""
        try:
            return self.ssdb_conn.qpop_front(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            pass
        return {}

    def push_wenshu_condition(self, condition):
        """Push an already-serialized condition back onto the SSDB queue."""
        try:
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            return

    def is_wenshu_id_exists(self, file_id):
        """Return True if the document id is already known.

        On SSDB failure this returns True — i.e. the safe default is to
        skip the document rather than re-fetch it.
        """
        try:
            return self.ssdb_conn.hexists(SSDB_WENSHU_ID_HSET, file_id)
        except Exception:
            return True

    def record_wenshu_id_error(self, file_id):
        """Remember a document id that failed to download. Best-effort."""
        try:
            self.ssdb_conn.hset(SSDB_WENSHU_ID_ERROR_HSET, file_id, "")
        except Exception:
            return

    def reset_wenshu_condition(self):
        """Rebuild the SSDB condition queue from Mongo rows with status == 0."""
        try:
            # Empty the queue first.
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
            # Re-queue every condition still marked status 0.
            # NOTE(review): item["condition"] is already a JSON string, yet
            # push_query_condition_queue re-serializes via dict_sorted(),
            # which calls .items() and would fail on a str (the failure is
            # then swallowed inside that helper). Verify nothing is lost.
            cursor = self.mongo_instance.getAll(filter={"status": 0},
                                                fields={"condition": 1,
                                                        "status": 1,
                                                        "_id": 0})
            for item in cursor:
                self.push_query_condition_queue(item["condition"])
        except Exception:
            pass
        return

    def exception_handle(self, condition, error_info):
        """Re-queue a failed condition, rotate the proxy and retry the request.

        NOTE(review): this method contains ``yield`` and is therefore a
        generator — the bare calls in exception_response()/err_callback()
        only create a generator object and never execute this body.
        Verify whether those call sites were meant to iterate/return it.
        """
        try:
            if self.name != "condition_spider":
                # Put the failed query condition back on the queue so it is
                # retried later.
                self.push_wenshu_condition(condition)
                self.logger.info("parse or parse_doc error->%s"
                                 % str(error_info))
                # An empty/blocked response means the server has throttled
                # us; pause before retrying.
                self.logger.info("sleep start!")
                sleep(5)  # pause 5 seconds
                self.logger.info("sleep end!")
                # Switch to a fresh proxy.
                self.proxy = self.proxy_api.get_proxy_one()
                self.logger.error("request retry")
                # Re-issue the request for the current condition.
                request = Request(url=self.list_url, method='POST',
                                  callback=self.parse,
                                  body=json_dumps(self.req_data),
                                  headers=self.headers, dont_filter=True,
                                  errback=self.err_callback)
                self.set_proxy(request)
                yield request
        except Exception:
            self.exception_handle(condition, "change proxy error!")

    def exception_response(self, condition, response):
        """Treat non-200, the visit-remind page, or the known 'blocked' body as failure."""
        if response.status != 200 \
                or "/Html_Pages/VisitRemind.html" in response.text \
                or response.text == "atZtw/muLK3OdYWrljShpg==":
            # NOTE(review): generator call — see exception_handle().
            self.exception_handle(condition,
                                  "status code:" + str(response.status))

    def dict_sorted(self, data):
        """Return data.items() sorted by key length, longest first.

        Gives every condition dict one canonical serialization order.
        """
        return sorted(data.items(), key=lambda t: len(t[0]), reverse=True)

    def closed(self, reason):
        """On spider close, kill this process; after 20 closures, restart the app.

        count.txt tracks how many times the spider has closed; pid.txt
        holds the parent pid. Reaching 20 triggers a full restart via the
        start_app.sh script before the kill.
        """
        if self.name != "condition_spider":
            self.lock.acquire()
            try:
                msg = super().closed(reason)
                self.logger.error("spider closed, pid->%d, reason->%s"
                                  % (self.pid, msg))
                with open("count.txt", "r") as f:
                    count = int(f.read())
                count += 1
                if count >= 20:
                    with open("pid.txt", "r") as f:
                        parent_pid = int(f.read())
                    self.logger.error(
                        "kill pid->%d, parent pid->%d, restart now!"
                        % (self.pid, parent_pid))
                    os_system(
                        "nohup /opt/test_wenshu/crawler/crawler_bqjr/start_app.sh "
                        ">/dev/null 2>&1 &")
                    os_system("kill -9 %d" % self.pid)
                else:
                    with open("count.txt", "w") as f:
                        f.write(str(count))
                    self.logger.error("kill pid->%d" % self.pid)
                    os_system("kill -9 %d" % self.pid)
            except Exception:
                self.logger.error("kill pid or restart error!")
            self.lock.release()

    def err_callback(self, failure):
        """Scrapy errback: log the failure and retry via exception_handle."""
        self.logger.error("request error->%s" % repr(failure))
        if self.name in ["wenshu_pc_spider", "wenshu_app_spider"]:
            # NOTE(review): generator call — see exception_handle().
            self.exception_handle(self.condition,
                                  "request failure, change proxy!")
def update_name_words():
    """Recompute surname/given-name character statistics and pickle them.

    Scans person names from the company-detail, shixin, zhixing and P2P
    deadbeat collections, tallies surname and given-name character
    frequencies, re-tiers the characters (popular / common / rare) by
    relative frequency, and writes the updated ``name_words`` object to
    NAME_WORDS_FILE_NAME.
    """
    from collections import defaultdict
    from itertools import islice, chain
    from data_storage.db_settings import MONGO_COMPANY_DB, MONGO_SHIXIN_DB, \
        MONGO_COMPANY_DETAIL_COLLECTIONS, MONGO_COMPANY_DETAIL3_COLLECTIONS, \
        MONGO_SHIXIN_DETAIL_COLLECTIONS, MONGO_ZHIXING_DETAIL_COLLECTIONS, \
        MONGO_P2P_DEADBEAT_COLLECTIONS
    from data_storage.mongo_db import MongoDB

    name_words = get_name_words()
    # CJK Unified Ideographs range used to recognise hanzi characters.
    hanzi_start_ord = ord("\u4E00")
    hanzi_end_ord = ord("\u9FA5")
    first_names = name_words.first_names
    rare_words = set(i for i in name_words.rare_words
                     if hanzi_start_ord <= ord(i) <= hanzi_end_ord)
    # Compound (multi-char) surnames must be matched before single-char ones.
    long_first_names = [w for w in first_names if len(w) > 1]
    single_first_names = [w for w in first_names if len(w) == 1]
    first_name_stat = defaultdict(int)
    word_stat = defaultdict(int)
    parse_count = 0

    def name_parse(name):
        # Tally one person name: surname into first_name_stat, remaining
        # hanzi into word_stat. Empty or >6-char names are ignored.
        if not name or len(name) > 6:
            return
        nonlocal parse_count
        parse_count += 1
        first_name = name[0]
        # index = number of leading chars consumed by the surname.
        index = 1
        for i in long_first_names:
            if name.startswith(i):
                first_name = i
                index = 2
                break
        else:
            for i in single_first_names:
                if name.startswith(i):
                    first_name = i
                    index = 1
                    break
        # Count only when the surname is a known compound or a hanzi char.
        if index == 2 or hanzi_start_ord <= ord(first_name) <= hanzi_end_ord:
            first_name_stat[first_name] += 1
            for w in islice(name, index, None):
                if hanzi_start_ord <= ord(w) <= hanzi_end_ord:
                    word_stat[w] += 1

    def company_detail(mongo_instance):
        # Collect legal-person, member and shareholder names from company docs.
        for i in mongo_instance.getAll(fields={"legal_person": 1,
                                               "member_info": 1,
                                               "shareholder_info": 1,
                                               "_id": 0}):
            name_set = set()
            name_set.add(i.get("legal_person"))
            name_set.update(j[0] for j in i.get("member_info", []))
            name_set.update(j[0] for j in i.get("shareholder_info", []))
            for j in name_set:
                name_parse(j)
            del name_set

    def shixin(mongo_instance):
        # Collect names from a shixin-style collection (one name per doc).
        for item in mongo_instance.getAll(fields={"name": 1, "_id": 0}):
            try:
                name_parse(item["name"])
            except Exception:
                continue

    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL_COLLECTIONS) as mongo_instance:
        company_detail(mongo_instance)
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL3_COLLECTIONS) as mongo_instance:
        company_detail(mongo_instance)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_DETAIL_COLLECTIONS) as mongo_instance:
        shixin(mongo_instance)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        shixin(mongo_instance)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_P2P_DEADBEAT_COLLECTIONS) as mongo_instance:
        shixin(mongo_instance)

    # Radical placeholder characters that leak in as fake hanzi; purge them.
    for k in ['钅', "亻", "扌", "犭"]:
        try:
            del first_name_stat[k]
        except Exception:
            pass
        try:
            del word_stat[k]
        except Exception:
            pass
        try:
            rare_words.remove(k)
        except Exception:
            pass

    # Surname tiers: "common" above 0.01% of parsed names, else "rare".
    new_common_first_names = []
    new_rare_first_names = []
    threshold = int(parse_count * 1E-4)
    for i in first_name_stat:
        if first_name_stat[i] > threshold:
            new_common_first_names.append(i)
        else:
            new_rare_first_names.append(i)
    # Unseen compound surnames are kept as rare.
    new_rare_first_names.extend(i for i in first_names
                                if i not in first_name_stat and len(i) > 1)

    def _print_words(words_list, stat_dict):
        # Sort by frequency (desc), dump to stdout, return the sorted list.
        ret_list = sorted(words_list, key=lambda a: stat_dict[a], reverse=True)
        for word in ret_list:
            print(word, stat_dict[word], sep=":", end=", ")
        print()
        print("".center(100, "-"))
        return ret_list

    new_common_first_names = _print_words(new_common_first_names,
                                          first_name_stat)
    new_rare_first_names = _print_words(new_rare_first_names,
                                        first_name_stat)

    # Given-name character tiers: popular > 0.01%, common > 0.0004%,
    # everything else is rare.
    all_first_names = new_common_first_names + new_rare_first_names
    new_popular_words = []
    new_common_words = []
    threshold1 = int(parse_count * 1E-4)
    threshold2 = int(parse_count * 4E-6)
    for i in word_stat:
        count = word_stat[i]
        if count > threshold1:
            new_popular_words.append(i)
        elif count > threshold2:
            new_common_words.append(i)
        else:
            rare_words.add(i)
    new_most_words = new_popular_words + new_common_words
    # Rare words must not overlap surnames or the higher tiers, and must
    # actually have been observed in this run.
    rare_words -= set(chain(all_first_names, new_most_words))
    rare_words &= word_stat.keys()
    new_popular_words = _print_words(new_popular_words, word_stat)
    new_common_words = _print_words(new_common_words, word_stat)
    new_rare_words = _print_words(rare_words, word_stat)

    # Publish the recomputed tiers onto the shared object and pickle it.
    name_words.common_first_names = new_common_first_names
    name_words.rare_first_names = new_rare_first_names
    name_words.first_names = all_first_names
    name_words.popular_words = new_popular_words
    name_words.common_words = new_common_words
    name_words.most_words = new_most_words
    name_words.rare_words = new_rare_words
    with open(NAME_WORDS_FILE_NAME, "wb") as f:
        dump(name_words, f)
form_data = parse_qs(urlsplit(response.url).query) old_captcha_code = form_data["j_captcha"][0] if old_captcha_code == self.captcha_code: self.captcha_code = self.get_captcha_code(response) form_data_new = { "id": form_data["id"][0], "j_captcha": self.captcha_code, "captchaId": self.captcha_id, } yield Request("http://zhixing.court.gov.cn/search/newdetail?" + urlencode(form_data_new), self.parse_item, dont_filter=True, meta=response.meta, errback=self.err_callback) else: data = json_loads(text) item["id"] = data.get("partyCardNum", "") item["execution_court"] = data.get("execCourtName") item["execution_money"] = data.get("execMoney") yield item except Exception: self.logger.exception("text(%s) url(%s)" % (text, response.url)) if __name__ == '__main__': with MongoDB(MONGO_SHIXIN_DB, MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance: count_zhixing_id(mongo_instance)
def parse(self, response):
    """Drive a selenium browser through the GSXT search flow for queued companies.

    For each company popped from the queue: load the search page, submit
    the company name, defeat the Geetest slider captcha (with retries),
    open the first search hit and hand its HTML to parse_search(). The
    loop never ends; an empty queue yields a no-op request instead.
    """
    ssdb_conn = get_ssdb_conn()
    mongo_instance = MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS)
    company = ""
    while True:
        try:
            company = get_one_company(mongo_instance, ssdb_conn)
            if company is not None:
                company_name = company["name"]
                # Fresh webdriver + waits per company.
                driver = self.__getwebdriver__()
                self.driver = driver
                self.logger.info("正在爬取公司:%s" % company_name)
                self.wait_20 = self.__getwait_20__()
                self.wait_10 = self.__getwait_10__()
                driver.get(response.url)
                # Wait for the keyword input box to render.
                self.wait_20.until(lambda d: d.find_element_by_xpath(
                    "//input[@id='keyword']").is_displayed())
                # Type the company name into the search box.
                keyword_input = driver.find_element_by_id("keyword")
                keyword_input.send_keys(company_name)
                # Click the query button.
                submit_btn = driver.find_element_by_id("btn_query")
                submit_btn.click()
                # Retry the click up to 3 times waiting for the captcha
                # widget; give up on this company after 3 timeouts.
                try_counts = 3
                while True:
                    try:
                        self.wait_20.until(
                            lambda d: d.find_element_by_xpath(
                                "//div[@class='gt_cut_bg gt_show']").
                            is_displayed())
                        break
                    except Exception:
                        submit_btn.click()
                        try_counts -= 1
                        if try_counts == 0:
                            break
                if try_counts == 0:
                    continue
                # First attempt at solving the Geetest slider captcha.
                hack = GeetestHack(driver, self.wait_10, self.logger)
                is_successful = hack.drag_and_move_slider(
                    "//div[@class='gt_cut_bg gt_show']",
                    "//div[@class='gt_cut_fullbg gt_show']",
                    "//div[@class='gt_cut_bg gt_show']"
                    "/div[@class='gt_cut_bg_slice']",
                    "//div[@class='gt_cut_fullbg gt_show']"
                    "/div[@class='gt_cut_fullbg_slice']",
                    "//div[@class='gt_slider_knob gt_show']",
                    "//a[@class='search_list_item db']")
                tries = 5
                if not is_successful:
                    # Retry the slider up to 5 more times.
                    sleep(2)
                    try:
                        while True:
                            self.wait_20.until(
                                lambda the_driver: the_driver.
                                find_element_by_xpath(
                                    "//div[@class='gt_cut_bg gt_show']"
                                ).is_displayed())
                            hack.drag_and_move_slider(
                                "//div[@class='gt_cut_bg gt_show']",
                                "//div[@class='gt_cut_fullbg gt_show']",
                                "//div[@class='gt_cut_bg gt_show']"
                                "/div[@class='gt_cut_bg_slice']",
                                "//div[@class='gt_cut_fullbg gt_show']"
                                "/div[@class='gt_cut_fullbg_slice']",
                                "//div[@class='gt_slider_knob gt_show']",
                                "//a[@class='search_list_item db']")
                            if tries == 0:
                                break
                            tries -= 1
                            sleep(0.8)
                    except Exception as e:
                        self.logger.warning("爬取异常:{message:%s}" % str(e))
                    if tries == 0:
                        # Captcha never solved: move on to the next company.
                        self.logger.debug("验证码破解失败,公司名:%s" % company_name)
                        continue
                try:
                    # Captcha solved: open the first search hit and parse it.
                    company_list = driver.find_elements_by_xpath(
                        "//a[@class='search_list_item db']")
                    if company_list:
                        company_link = company_list[0].get_attribute(
                            "href")
                        driver.get(company_link)
                        self.wait_10.until(
                            lambda d: d.find_element_by_xpath(
                                "//div[@id='primaryInfo']"
                                "/div[@class='details "
                                "clearfix']").is_displayed())
                        response = HtmlResponse(driver.current_url,
                                                encoding="utf-8",
                                                body=driver.page_source)
                        yield self.parse_search(company_name, response)
                except Exception:
                    self.logger.info("爬取异常:国家企业信用信息公示系统没有%s的相关信息"
                                     % company_name)
            else:
                # Queue empty: keep the spider alive with a no-op request.
                yield Request(DO_NOTHING_URL, self.do_nothing,
                              errback=self.do_nothing, dont_filter=True)
        except Exception as e:
            self.logger.warning("爬取异常:{company: %s,message:%s}"
                                % (company, str(e)))
        finally:
            # Always release the browser for this iteration.
            if hasattr(self, 'driver') and self.driver is not None:
                self.driver.quit()
def __init__(self, *args, **kwargs):
    """Attach the company-detail (v2) Mongo collection to the spider."""
    super().__init__(*args, **kwargs)
    self.mongo_instance = MongoDB(MONGO_COMPANY_DB,
                                  MONGO_COMPANY_DETAIL2_COLLECTIONS)
def __init__(self, *args, **kwargs):
    """Preload already-crawled company names into ``self.name_set``."""
    super().__init__(*args, **kwargs)
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL2_COLLECTIONS) as mongo_instance:
        cursor = mongo_instance.getAll(fields={"name": 1, "_id": 0})
        self.name_set = {doc["name"] for doc in cursor}
class DetailTianyanchaSpider(CompanySpider):
    """Crawl company names/details from tianyancha.com via its JSON endpoints.

    Company ids are popped from an SSDB queue; each id's document (which
    carries a tianyancha search_url) is fetched from Mongo, the numeric
    tianyancha id is extracted from that URL, and the "near" and
    "company" JSON APIs are queried.
    """
    name = "tianyancha"
    allowed_domains = ["tianyancha.com"]
    start_urls = ["http://www.tianyancha.com/"]
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mongo_instance = MongoDB(MONGO_COMPANY_DB,
                                      MONGO_COMPANY_DETAIL2_COLLECTIONS)

    def _get_one_company(self):
        """Pop queue ids until a matching Mongo doc is found; None when drained."""
        while True:
            _id = self.ssdb_conn.qpop_front(SSDB_TIANYANCHA_QUEUE_NAME)
            if _id is None:
                return None
            a_company = self.mongo_instance.getOne(
                filter={"_id": ObjectId(_id)})
            if a_company is not None:
                a_company.pop("_id")
                return a_company

    def start_requests(self):
        """Endlessly yield "near companies" JSON requests for queued companies."""
        parse_company_name = self.parse_company_name
        _get_one_company = self._get_one_company
        # The tianyancha id is the trailing digit run of the search_url.
        tianyancha_id_pattern = re_compile(r"(\d+)$")
        while True:
            a_company = _get_one_company()
            if a_company is None:
                # Queue empty: no-op request keeps the spider alive.
                yield Request(DO_NOTHING_URL, self.do_nothing,
                              errback=self.do_nothing, dont_filter=True)
                continue
            try:
                tianyancha_id = tianyancha_id_pattern.search(
                    a_company["search_url"]).group(1)
                a_company["tianyancha_id"] = tianyancha_id
                request = Request(
                    "http://www.tianyancha.com/near/s.json?id=%s"
                    % tianyancha_id, parse_company_name)
                request.meta["company_other_info"] = a_company
                yield request
            except Exception:
                self.logger.error("No tianyancha_id url(%s)"
                                  % a_company["search_url"])

    def parse(self, response):
        """Follow up with the company-detail JSON for the current company.

        BUG FIX: ``tianyancha_id`` is a *string* captured by the regex in
        start_requests(), so the original ``"%d" % tianyancha_id`` raised
        TypeError on every call; ``%s`` formats it correctly.
        """
        meta = response.meta
        tianyancha_id = meta["company_other_info"]["tianyancha_id"]
        yield Request("http://www.tianyancha.com/company/%s.json"
                      % tianyancha_id, self.parse_company,
                      meta=response.meta)

    def parse_company_name(self, response):
        """Extract nearby-company names from the "near" JSON and emit items."""
        try:
            text = response.text
            if '"state":"ok"' in text:  # success marker
                spider_name = self.name
                name_exists_func = self.is_search_name_exists
                record_name_func = self.record_search_name
                datas = json_loads(text)["data"]
                if "items" in datas:
                    for data in datas["items"]:
                        name = data["name"]
                        if not name:
                            continue
                        if name_exists_func(name):
                            continue
                        record_name_func(name)
                        item = CompanyItem()
                        item["from_web"] = spider_name
                        # str() guards against a numeric id in the payload.
                        item["from_url"] = \
                            "http://www.tianyancha.com/company/" \
                            + str(data["id"])
                        item["area"] = "shenzhen"
                        item["name"] = name
                        yield item
            else:
                self.logger.warning("天眼查---查找相关公司失败,URL(%s)"
                                    % response.url)
        except Exception:
            self.logger.exception("天眼查---查找相关公司异常,URL(%s)"
                                  % response.url)

    def parse_company(self, response):
        """Parse the company-detail JSON (extraction not implemented yet)."""
        try:
            text = response.text
            if '"state":"ok"' in text:  # success marker
                datas = json_loads(text)
                # TODO: map ``datas`` into a detail item; currently unused.
                pass
            else:
                self.logger.error("天眼查---搜索公司失败,URL(%s)" % response.url)
        except Exception:
            self.logger.exception("天眼查---搜索公司异常,URL(%s)" % response.url)
def generate_query(self):
    """Generate (court, date-range[, case-type]) query conditions and queue them.

    The wenshu search caps results around 220 per query, so for each
    court the [start_date, end_date] span is subdivided adaptively:
    windows whose hit count exceeds the cap are shrunk using the observed
    document density, and a one-day window still over the cap is split
    further by case type. Conditions yielding 1-220 hits are recorded
    (status 0) and pushed to the SSDB queue; empty ones are recorded with
    status -1.
    """
    self.logger.info("query condition init begin!")
    # Courts are the outer query dimension.
    mongo_instance = MongoDB(MONGO_WENSHU_DB, MONGO_CHINACOURT_COLLECTIONS)
    # no_cursor_timeout: the scan may outlive the default cursor timeout.
    # NOTE(review): such cursors must be closed explicitly; this one is
    # fully consumed immediately, but confirm the driver releases it.
    cursor = mongo_instance.getAll(fields={
        "_id": 1,
        "name": 1
    },
                                   sort=[("province", MONGO_ASCENDING)],
                                   no_cursor_timeout=True)
    court_list = [court["name"] for court in cursor]
    # Case types used to split one-day windows that still exceed the cap.
    case_type_list = ["1", "2", "3", "4", "5"]
    for court in court_list:
        count = 1
        # When the hit count is large, a density-based estimate replaces
        # the pure halving schedule for the window size (in days).
        avg_interval = 0
        avg_interval_first = 0
        start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(self.end_date, "%Y-%m-%d")
        while True:
            divisor = (count**2) if count != 2 else 2
            # Window length: density estimate when available, otherwise a
            # shrinking fraction of the remaining span.
            interval_day = avg_interval if avg_interval > 0 else ceil(
                (end_date - start_date).days / divisor)
            if avg_interval_first > 0:
                avg_interval = avg_interval_first
                avg_interval_first = 0
            self.logger.info("interval_day->%s" % interval_day)
            # Build this iteration's date window, clamped to end_date.
            end_date_temp = min(start_date + timedelta(days=interval_day),
                                end_date)
            query_date = "%s TO %s" % (start_date.strftime("%Y-%m-%d"),
                                       end_date_temp.strftime("%Y-%m-%d"))
            self.logger.info("query_date->%s!" % query_date)
            query_condition = dict()
            query_condition["case_type"] = "0"  # all case types
            query_condition["court"] = court
            query_condition["date"] = query_date
            if self.is_query_condition_exists(query_condition):
                # Already generated in a previous run: advance or stop.
                if end_date == end_date_temp:
                    self.logger.info("%s query_condition exists!break!"
                                     % court)
                    break
                else:
                    start_date = end_date_temp + self.one_day
                    self.logger.info(
                        "%s query_condition exists!continue!"
                        % json_dumps(query_condition))
                    continue
            # Windows with <= 220 hits are final: record and queue them.
            query_count = self.get_count_by_condition(court=court,
                                                      date=query_date)
            if 0 <= query_count <= 220:
                if query_count > 0:
                    self.record_query_condition(query_condition)
                    self.push_query_condition_queue(query_condition)
                # Empty window: record only, with status -1.
                if query_count == 0:
                    self.record_query_condition(query_condition, -1)
                if end_date == end_date_temp:
                    if count > 1:
                        # Also record the court's full polling range with
                        # status -1 as an end marker.
                        init_date = "%s TO %s" % (self.start_date,
                                                  self.end_date)
                        query_condition["date"] = init_date
                        self.record_query_condition(query_condition, -1)
                    self.logger.info("%s query condition end!" % court)
                    break
                else:
                    start_date = end_date_temp + self.one_day
            else:
                # Over the cap: remember the last estimate and derive a new
                # window length targeting ~180 hits from observed density.
                if count > 1:
                    avg_interval_first = avg_interval
                temp_days = (end_date_temp - start_date).days
                try:
                    avg_interval = int(180 / (int(query_count) / temp_days))
                except ZeroDivisionError:
                    self.logger.exception("爬取出错,出错原因:")
                    break
                # A single day still above the cap: split by case type.
                if temp_days == 1:
                    for case_type in case_type_list:
                        query_condition["case_type"] = case_type
                        if not self.is_query_condition_exists(
                                query_condition):
                            self.record_query_condition(query_condition)
                            self.push_query_condition_queue(
                                query_condition)
                    if end_date == end_date_temp:
                        if count > 1:
                            # End marker for this court (status -1).
                            init_date = "%s TO %s" % (self.start_date,
                                                      self.end_date)
                            query_condition["date"] = init_date
                            self.record_query_condition(
                                query_condition, -1)
                        self.logger.info("%s query condition end!" % court)
                        break
                    else:
                        start_date = end_date_temp + self.one_day
            count += 1
    self.logger.info("query condition init end!")
class ProxyApi(object):
    """Query helper over the Mongo proxy pool.

    Provides filtered lookups of "good" proxies (ones that passed every
    check URL for their scheme and responded quickly), plus a paid-API
    fallback (kuaidaili).
    """

    def __init__(self):
        # The MongoDB client is thread-safe with a connection pool.
        self.mongo_instance = MongoDB(MONGO_PROXY_DB, MONGO_PROXY_COLLECTIONS)
        # Per-scheme filter: a proxy recorded for the other scheme only is
        # excluded; dual-scheme proxies are matched exactly.
        self.scheme_filter_dict = {
            SchemeType.HTTP: {
                "$ne": SchemeType.HTTPS
            },
            SchemeType.HTTPS: {
                "$ne": SchemeType.HTTP
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$eq": SchemeType.HTTP_OR_HTTPS
            },
        }
        # Same exclusion logic for the supported HTTP method.
        self.method_filter_dict = {
            SupportMethod.GET: {
                "$ne": SupportMethod.POST
            },
            SupportMethod.POST: {
                "$ne": SupportMethod.GET
            },
            SupportMethod.GET_OR_POST: {
                "$eq": SupportMethod.GET_OR_POST
            },
        }
        # "Good" quality means the proxy passed every check URL configured
        # for its scheme.
        self.good_quality_dict = {
            SchemeType.HTTP: {
                "$gte": len(HTTP_CHECK_URL_LIST)
            },
            SchemeType.HTTPS: {
                "$gte": len(HTTPS_CHECK_URL_LIST)
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$gte": len(HTTPS_CHECK_URL_LIST)
            },
        }
        # Per-scheme response-time budget in seconds (>= 0 excludes the
        # -1 "never checked" sentinel).
        self.good_response_time_dict = {
            SchemeType.HTTP: {
                "$lt": 1,
                "$gte": 0
            },
            SchemeType.HTTPS: {
                "$lt": 3,
                "$gte": 0
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$lt": 1,
                "$gte": 0
            },
        }

    def close(self):
        """Release the Mongo connection."""
        self.mongo_instance.close()

    def get_proxy_from_kuaidaili(self, stable_time=StableTime.MIN_10):
        """Fetch proxies from the paid kuaidaili API.

        Returns the response split into "ip:port,location" strings.
        Raises NoProxyException on any failure (non-200 or network error).
        """
        try:
            url = "http://dps.kuaidaili.com/api/getdps/?" \
                  "orderid=959308673589451&num=50&sep=2&ut=" + str(stable_time)
            resp = http_get(url)
            if resp.status_code != 200:
                raise NoProxyException
            return resp.text.split()
        except Exception:
            from traceback import print_exc
            print_exc()
            raise NoProxyException

    def get_proxy_all(self, location=None, anonymous=AnonymousLevel.MIDDLE,
                      scheme=SchemeType.HTTP, method=SupportMethod.GET):
        """Return all good "ip:port" strings for ``scheme``, fastest first.

        NOTE(review): the ``location``, ``anonymous`` and ``method``
        parameters are currently ignored (their filters are commented
        out below) — only scheme, quality and response time apply.
        """
        the_filter = {
            "quality": self.good_quality_dict[scheme],
            "response_time": self.good_response_time_dict[scheme],
            # "anonymous_level": {"$lte": anonymous},
            "scheme_type": self.scheme_filter_dict[scheme],
            # "support_method": self.method_filter_dict[method],
        }
        # if location:
        #     the_filter["location"] = re_compile(".*" + location + ".*")
        cursor = self.mongo_instance.getAll(filter=the_filter,
                                            fields={
                                                "ip": 1,
                                                "port": 1,
                                                "_id": 0
                                            },
                                            sort=[("response_time",
                                                   MONGO_ASCENDING)])
        return [item["ip"] + ":" + str(item["port"]) for item in cursor]

    def get_proxy_one(self, location=None, anonymous=AnonymousLevel.MIDDLE,
                      scheme=SchemeType.HTTP, method=SupportMethod.GET,
                      stable_time=StableTime.MIN_10):
        """Return one random good proxy, or raise NoProxyException if none."""
        good_proxys = self.get_proxy_all(location, anonymous, scheme, method)
        if good_proxys:
            return rand_choice(good_proxys)
        else:
            raise NoProxyException