Example #1
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ssdb_conn = get_ssdb_conn()
        self.mongo_instance = MongoDB(MONGO_WENSHU_DB,
                                      MONGO_WENSHU_CONDITION_COLLECTIONS)
        self.proxy_api = ProxyApi()
        self.proxy = self.proxy_api.get_proxy_one()
        self.pid = getpid()
        self.lock = Lock()
        self.logger.info("init pid->%d" % self.pid)
Example #2
    def get_search_good_word_request(self):
        _get_search_request = self._get_search_request

        filter_set = set(w for w in self.citys if len(w) == 2)
        filter_set.update(w for w in self.first_names if len(w) == 2)

        # Two-character permutations of common characters
        good_words = self.good_words.copy()
        for word1, word2 in product(good_words, good_words):
            name = word1 + word2
            if name not in filter_set:
                filter_set.add(name)
                yield _get_search_request(name, 0)

        if self.good_names:
            good_first_name = set(self.first_names) & set(self.good_words)
            good_names = self.good_names.copy()
            for word1, word2 in product(good_first_name, good_names):
                name = word1 + word2
                filter_set.add(name)
                yield _get_search_request(name, 0)

            good_name_word = set(self.good_words) - set(self.first_names)
            for word1, word2 in product(good_names, good_name_word):
                name = word1 + word2
                filter_set.add(name)
                yield _get_search_request(name, 0)

        # Name list published by the courts
        with MongoDB(MONGO_SHIXIN_DB,
                     MONGO_SHIXIN_LIST_COLLECTIONS) as mongo_instance:
            name_set = {
                i["name"]
                for i in mongo_instance.getAll(fields={
                    "name": 1,
                    "_id": 0
                }) if len(i["name"]) < 5
            }

        # List of persons subject to enforcement (zhixing)
        with MongoDB(MONGO_SHIXIN_DB,
                     MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
            name_set.update(i["name"] for i in mongo_instance.getAll(fields={
                "name": 1,
                "_id": 0
            }) if len(i["name"]) < 5)

        name_set -= filter_set
        self.logger.info("name_set length: %d" % len(name_set))
        for name in name_set:
            yield _get_search_request(name, 0)
Example #3
    def __init__(self):
        self.mongo_instance = MongoDB(MONGO_PROXY_DB,
                                      MONGO_PROXY_COLLECTIONS)  # thread-safe, with a connection pool
        self.scheme_filter_dict = {
            SchemeType.HTTP: {
                "$ne": SchemeType.HTTPS
            },
            SchemeType.HTTPS: {
                "$ne": SchemeType.HTTP
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$eq": SchemeType.HTTP_OR_HTTPS
            },
        }
        self.method_filter_dict = {
            SupportMethod.GET: {
                "$ne": SupportMethod.POST
            },
            SupportMethod.POST: {
                "$ne": SupportMethod.GET
            },
            SupportMethod.GET_OR_POST: {
                "$eq": SupportMethod.GET_OR_POST
            },
        }
        self.good_quality_dict = {
            SchemeType.HTTP: {
                "$gte": len(HTTP_CHECK_URL_LIST)
            },
            SchemeType.HTTPS: {
                "$gte": len(HTTPS_CHECK_URL_LIST)
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$gte": len(HTTPS_CHECK_URL_LIST)
            },
        }
        self.good_response_time_dict = {
            SchemeType.HTTP: {
                "$lt": 1,
                "$gte": 0
            },
            SchemeType.HTTPS: {
                "$lt": 3,
                "$gte": 0
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$lt": 1,
                "$gte": 0
            },
        }
Example #4
    def get_search_request(self):
        ssdb_conn = get_ssdb_conn()
        mongo_instance = MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS)
        _add_proxy = self._add_proxy
        parse_search = self.parse_search
        name_set = self.name_set
        while True:
            company = get_one_company(mongo_instance, ssdb_conn)
            if company is not None:
                company_name = company["name"]
                if company_name in name_set:
                    continue

                form_data = {
                    "action": "getSSDJBList",
                    "keyword": company_name,
                    "PageIndex": "1",
                }
                request = FormRequest(
                    "http://app03.szmqs.gov.cn/xyjggs.webui/xyjggs/Ajax/Ajax.ashx",
                    parse_search,
                    dont_filter=True,
                    formdata=form_data)
                request.meta["company_other_info"] = company
                _add_proxy(request)
                yield request
            else:
                yield Request(DO_NOTHING_URL,
                              self.do_nothing,
                              errback=self.do_nothing,
                              dont_filter=True)
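
The request built above is routed through a proxy by self._add_proxy(request), which is not part of this excerpt. Below is a minimal sketch of what such a helper might look like, assuming it only sets Scrapy's standard meta["proxy"] key from the spider's current proxy (the self.proxy attribute follows Examples #1 and #21); this is an assumption, not the original helper:

    def _add_proxy(self, request):
        # Hypothetical helper: route the request through the spider's current proxy
        # by setting Scrapy's standard meta["proxy"] key.
        if getattr(self, "proxy", None):
            request.meta["proxy"] = "http://" + self.proxy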
Example #5
def del_duplicate_zhixing():
    file_code_set = set()
    duplicate_ids = []
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={"link_id": 1},
                                          sort=[("_id", MONGO_DESCENDING)]):
            try:
                file_code = item["link_id"]
                if file_code not in file_code_set:
                    file_code_set.add(file_code)
                else:
                    duplicate_ids.append(item["_id"])
            except Exception:
                print_exc()
        del file_code_set

        for the_id in duplicate_ids:
            mongo_instance.deleteOne(filter={"_id": the_id})

        count_zhixing_id(mongo_instance)

    print("Del %d of duplicated item in collection[%s]" %
          (len(duplicate_ids), MONGO_ZHIXING_DETAIL_COLLECTIONS))
    del duplicate_ids
Example #6
def check_proxy_usable():
    del_duplicate_proxy(MONGO_PROXY_COLLECTIONS)
    # proxy_set = del_duplicate_proxy(MONGO_GOOD_PROXY_COLLECTIONS)

    mongo_instance_spider = MongoDB(MONGO_PROXY_DB, MONGO_PROXY_COLLECTIONS)
    # mongo_instance_kuaidaili = MongoDB(MONGO_PROXY_DB, MONGO_GOOD_PROXY_COLLECTIONS)

    while True:
        try:
            _check_proxy_usable(mongo_instance_spider)

            # _check_proxy_usable(mongo_instance_kuaidaili)
            # for proxy in get_proxy_from_kuaidaili():
            #     if proxy in proxy_set:
            #         continue
            #
            #     ip, other = proxy.split(":", 1)
            #     port, location = other.split(",", 1)
            #     item = {"ip": ip,
            #             "port": int(port),
            #             "location": location,
            #             "response_time": -1,
            #             "fail_times": 0,
            #             "ok_times": 0,
            #             "quality": 0,
            #             }
            #     proxy_set.add(proxy)
            #     mongo_instance_kuaidaili.insertOne(item)
        except Exception:
            print_exc()
Example #7
def del_duplicate_shixinlist():
    id_set = set()
    duplicate_ids = []
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_LIST_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={
                "id": 1,
                "name": 1
        },
                                          sort=[("_id", MONGO_DESCENDING)]):
            try:
                the_id = item["name"] + item["id"]
                if the_id not in id_set:
                    id_set.add(the_id)
                else:
                    duplicate_ids.append(item["_id"])
            except Exception:
                print_exc()
        del id_set

        for the_id in duplicate_ids:
            mongo_instance.deleteOne(filter={"_id": the_id})

    print("Del %d of duplicated item in collection[%s]" %
          (len(duplicate_ids), MONGO_SHIXIN_LIST_COLLECTIONS))
    del duplicate_ids
Example #8
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # self.proxy_api = ProxyApi()
        self.WEBSITE_BUSY_STR = "过于频繁,请稍"
        with MongoDB(MONGO_COMPANY_DB,
                     MONGO_COMPANY_DETAIL_COLLECTIONS) as mongo_instance:
            self.name_set = set(item["name"]
                                for item in mongo_instance.getAll(fields={
                                    "name": 1,
                                    "_id": 0
                                }))
Example #9
def push_all_company_id():
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL_COLLECTIONS) as mongo_instance:
        finished = set(item["name"] for item in mongo_instance.getAll(fields={
            "name": 1,
            "_id": 0
        }))

    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL2_COLLECTIONS) as mongo_instance:
        finished.update(item["name"] for item in mongo_instance.getAll(fields={
            "name": 1,
            "_id": 0
        }))

    ssdb_conn = get_ssdb_conn()
    ssdb_conn.qclear(SSDB_COMPANY_QUEUE_NAME)

    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={
                "name": 1,
                "_id": 1
        },
                                          filter={
                                              "$or": [{
                                                  "area": "shenzhen"
                                              }, {
                                                  "name":
                                                  re_compile(r".*深圳.*")
                                              }]
                                          },
                                          sort=[("_id", MONGO_DESCENDING)]):
            name = item["name"]
            if name not in finished:
                ssdb_conn.qpush_back(SSDB_COMPANY_QUEUE_NAME, str(item["_id"]))

    ssdb_conn.close()
    del finished

    print("push_all_company_id done.")
Example #10
def get_mobile_phone(request):
    mongo_instance = MongoDB(MONGO_MOBILEBRAND_DB, MONGO_MOBILEBRAND_COLLECTIONS)
    try:
        update_time = request.GET.get("update_time")
        if update_time:
            update_time = datetime.strptime(update_time, "%Y-%m-%d %H:%M:%S.%f")
        else:
            update_time = datetime.now() - relativedelta(days=1)
        data_list = []
        for data in mongo_instance.getAll(filter={"update_time": {"$gt": update_time}},
                                          fields={"product_name": 1, "for_sale": 1,
                                                  "brand_name": 1, "product_price": 1,
                                                  "update_time": 1}):
            data["_id"] = "0x" + str(data["_id"])
            data_list.append(data)
    except Exception:
        logger.exception("get_mobile_phone")
        return HttpResponseBadRequest("Bad Request!")
    else:
        return JsonResponse(data_list, safe=False)
    finally:
        mongo_instance.close()
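
The function above is a plain Django view, so it only needs to be wired into a URLconf. A minimal sketch, assuming the view lives in a module importable as views; the route name is illustrative only:

# urls.py (sketch)
from django.urls import path

from . import views

urlpatterns = [
    # e.g. GET /mobile_phone/?update_time=2018-01-01 00:00:00.000000
    path("mobile_phone/", views.get_mobile_phone),
]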
Example #11
def record_all_zhixing_id():
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_ZHIXING_ID_HSET_NAME)

    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={"link_id": 1, "_id": 0}):
            try:
                ssdb_conn.hset(SSDB_ZHIXING_ID_HSET_NAME, item["link_id"], "")
            except Exception:
                print_exc()

    ssdb_conn.close()

    print("record_all_zhixing_id done.")
Example #12
def push_all_tianyancha_company_id():
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.qclear(SSDB_TIANYANCHA_QUEUE_NAME)

    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL2_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(
                fields={"_id": 1},
                filter={
                    "search_url":
                    re_compile(r"^http://www\.tianyancha\.com/company/")
                },
                sort=[("_id", MONGO_DESCENDING)]):
            ssdb_conn.qpush_back(SSDB_TIANYANCHA_QUEUE_NAME, str(item["_id"]))

    ssdb_conn.close()

    print("push_all_tianyancha_company_id done.")
Example #13
def record_all_company_name():
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_COMPANY_HSET_NAME)

    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={"name": 1, "_id": 0}):
            try:
                # Record each crawled company name in SSDB so it is not crawled again
                name = item["name"]
                if len(name) < 60:
                    ssdb_conn.hset(SSDB_COMPANY_HSET_NAME, name, "")
            except Exception:
                print_exc()

    ssdb_conn.close()

    print("record_all_company_name done.")
Example #14
def _del_duplicate_company(collections=MONGO_COMPANY_DETAIL_COLLECTIONS):
    name_set = set()
    duplicate_ids = []
    with MongoDB(MONGO_COMPANY_DB, collections) as mongo_instance:
        for item in mongo_instance.getAll(fields={"name": 1},
                                          sort=[("_id", MONGO_DESCENDING)]):
            name = item["name"]
            if name not in name_set:
                name_set.add(name)
            else:
                duplicate_ids.append(item["_id"])

        for the_id in duplicate_ids:
            mongo_instance.deleteOne(filter={"_id": the_id})

    del name_set

    print("Del %d of duplicated item in collection[%s]" %
          (len(duplicate_ids), collections))
Example #15
def record_all_shixin_id():
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_SHIXIN_ID_HSET_NAME)

    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_DETAIL_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={
                "from_web": 1,
                "link_id": 1,
                "_id": 0
        }):
            try:
                the_id = item["from_web"] + "_" + str(item["link_id"])
                ssdb_conn.hset(SSDB_SHIXIN_ID_HSET_NAME, the_id, "")
            except Exception:
                print_exc()

    ssdb_conn.close()

    print("record_all_shixin_id done.")
Example #16
def record_all_shixinlist_id():
    ssdb_conn = get_ssdb_conn()
    ssdb_conn.hclear(SSDB_SHIXIN_LIST_ID_HSET_NAME)

    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_LIST_COLLECTIONS) as mongo_instance:
        for item in mongo_instance.getAll(fields={
                "id": 1,
                "name": 1,
                "_id": 0
        }):
            try:
                the_id = item["name"] + item["id"]
                ssdb_conn.hset(SSDB_SHIXIN_LIST_ID_HSET_NAME, the_id, "")
            except Exception:
                print_exc()

    ssdb_conn.close()

    print("record_all_shixin_list_id done.")
Example #17
def del_duplicate_proxy(collection):
    proxy_unique_set = set()
    duplicate_ids = []
    with MongoDB(MONGO_PROXY_DB, collection) as mongo_instance:
        for item in mongo_instance.getAll(fields={"ip": 1, "port": 1}):
            try:
                proxy_unique = item["ip"] + ":" + str(item["port"])
                if proxy_unique not in proxy_unique_set:
                    proxy_unique_set.add(proxy_unique)
                else:
                    duplicate_ids.append(item["_id"])
            except Exception:
                print_exc()

        for the_id in duplicate_ids:
            mongo_instance.deleteOne(filter={"_id": the_id})

    print("Del %d of duplicated item in collection[%s]" %
          (len(duplicate_ids), collection))
    del duplicate_ids

    return proxy_unique_set
Example #18
    def start_requests(self):
        ssdb_conn = get_ssdb_conn()
        mongo_instance = MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS)
        parse_search = self.parse_search
        name_set = self.name_set
        while True:
            company = get_one_company(mongo_instance, ssdb_conn)
            if company is not None:
                company_name = company["name"]
                if company_name in name_set:
                    continue

                # form_data = {"userName": company_name
                #              }
                # request = FormRequest("http://qy.58.com/ajax/getBusinessInfo",
                #                       parse_search, dont_filter=True, formdata=form_data)
                request = Request("http://qy.58.com/ajax/getBusinessInfo?userName="******"company_other_info"] = company
                yield request
            else:
                yield Request(DO_NOTHING_URL, self.do_nothing,
                              errback=self.do_nothing, dont_filter=True)
Example #19
    def __init__(self, item_class, mongo_db, mongo_collection):
        self.item_class = item_class
        self.mongo_instance = MongoDB(mongo_db, mongo_collection)
        self.key = b"zhegemiyaobeininadaoyemeiyouyong"
Example #20
class MongoPipelineUtils(object):
    def __init__(self, item_class, mongo_db, mongo_collection):
        self.item_class = item_class
        self.mongo_instance = MongoDB(mongo_db, mongo_collection)
        self.key = b"zhegemiyaobeininadaoyemeiyouyong"

    def encrypt(self, text):
        key = self.key
        cryptor = AES_new(key, MODE_EAX, key)
        return b64encode(cryptor.encrypt(text.encode()))

    def strip_insert_item(self, item):
        return {
            k: (v.strip(BLANK_CHARS) if isinstance(v, str) else v)
            for k, v in item.items()
        }

    def write_item_to_db(self, item):
        # Save the HTML source to a file
        if "html" in item:
            html = item.pop("html")
            html_hash = md5(html.encode()).hexdigest()
            filename = os_path.join(HTML_DIR, html_hash + ".html")
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)
            item["html_file"] = html_hash

        insert_dict = self.strip_insert_item(item)
        result = self.mongo_instance.insertOne(insert_dict)
        return str(result.inserted_id)

    def process_item(self, item, spider):
        item_class = self.item_class
        if type(item) is item_class:
            try:
                if "password" in item:
                    item["password"] = self.encrypt(item["password"])
                self.write_item_to_db(item)
            except Exception:
                spider.logger.exception("%s write item(%s) to db error: " %
                                        (spider.name, item))
            raise DropItem("Processing %s item done." % item_class.__name__)
        else:
            return item

    def all_data_2_string(self, data_dict):
        new_dict = {}

        for k, v in data_dict.items():
            if type(v) in [str, none_type]:
                data = v
            elif isinstance(v, dict):
                data = self.all_data_2_string(v)
            elif type(v) in [list, tuple]:
                data = [self.all_data_2_string(i) for i in v]
            elif isinstance(v, bytes):
                data = v.decode()
            else:
                data = str(v)
            new_dict[k] = data

        return new_dict

    def rabbitmq_sender(self, queue, item_dict):
        """
        Gzip-compress and base64-encode the content before it is pushed to MQ
        :param queue: queue name
        :return:
        """
        del item_dict["_id"]
        content = b64encode(compress(
            json_dumps(item_dict).encode("utf-8"))).decode("utf-8")
        with RabbitmqSender(queue=queue,
                            exchange=RABBITMQ_EXCHANGE,
                            durable=True) as rs:
            rs.send(content)
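
MongoPipelineUtils is meant to be subclassed once per item type and registered as a normal Scrapy item pipeline. A minimal usage sketch, assuming a CompanyItem item class and the MONGO_COMPANY_* constants seen in the other examples; the module path and pipeline priority are illustrative only:

# pipelines.py (sketch)
class CompanyDetailPipeline(MongoPipelineUtils):
    def __init__(self):
        super().__init__(CompanyItem, MONGO_COMPANY_DB,
                         MONGO_COMPANY_DETAIL_COLLECTIONS)

# settings.py (sketch)
ITEM_PIPELINES = {
    "crawler_bqjr.pipelines.CompanyDetailPipeline": 300,
}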
Example #21
class WenshuSpider(NoticeClosedSpider):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ssdb_conn = get_ssdb_conn()
        self.mongo_instance = MongoDB(MONGO_WENSHU_DB,
                                      MONGO_WENSHU_CONDITION_COLLECTIONS)
        self.proxy_api = ProxyApi()
        self.proxy = self.proxy_api.get_proxy_one()
        self.pid = getpid()
        self.lock = Lock()
        self.logger.info("init pid->%d" % self.pid)

    def is_query_condition_exists(self, condition):
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            result = self.mongo_instance.getOne(
                filter={"condition": condition},
                fields={
                    "condition": 1,
                    "status": 1,
                    "_id": 0
                })
            if result:
                return True
        except Exception:
            pass
        return False

    def record_query_condition(self, condition, status=0):
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            item = {
                "condition": condition,
                "status": status,
            }
            self.mongo_instance.insertOne(item)
        except Exception:
            return

    def push_query_condition_queue(self, condition):
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            return

    def clear_query_condition(self):
        try:
            self.mongo_instance.deleteMany(filter={})
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            return

    def get_wenshu_condition(self):
        try:
            return self.ssdb_conn.qpop_front(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            pass
        return {}

    def push_wenshu_condition(self, condition):
        try:
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            return

    def is_wenshu_id_exists(self, file_id):
        try:
            return self.ssdb_conn.hexists(SSDB_WENSHU_ID_HSET, file_id)
        except Exception:
            return True

    def record_wenshu_id_error(self, file_id):
        try:
            self.ssdb_conn.hset(SSDB_WENSHU_ID_ERROR_HSET, file_id, "")
        except Exception:
            return

    def reset_wenshu_condition(self):
        try:
            # Clear the queue
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
            # Push the conditions whose status is 0 back onto the queue
            cursor = self.mongo_instance.getAll(filter={"status": 0},
                                                fields={
                                                    "condition": 1,
                                                    "status": 1,
                                                    "_id": 0
                                                })
            for item in cursor:
                self.push_query_condition_queue(item["condition"])
        except Exception:
            pass
        return

    def exception_handle(self, condition, error_info):
        try:
            if self.name != "condition_spider":
                # script_name = "start_pc.sh" if self.name == "wenshu_pc_spider" else "start_app.sh"
                # On any exception, push the failed query condition back onto the query queue
                self.push_wenshu_condition(condition)
                self.logger.info("parse or parse_doc error->%s" %
                                 str(error_info))
                # If the response is empty or contains markers such as "rtn",
                # the server has blocked us; pause for a while and keep trying
                self.logger.info("sleep start!")
                sleep(5)  # pause for 5 seconds
                self.logger.info("sleep end!")
                # Switch to a new proxy
                self.proxy = self.proxy_api.get_proxy_one()  # change proxy
                self.logger.error("request retry")
                # Re-request the current condition
                request = Request(url=self.list_url,
                                  method='POST',
                                  callback=self.parse,
                                  body=json_dumps(self.req_data),
                                  headers=self.headers,
                                  dont_filter=True,
                                  errback=self.err_callback)
                self.set_proxy(request)
                yield request
        except Exception:
            self.exception_handle(condition, "change proxy error!")

            # os_system("kill -9 %d" % pid)
            # os_system("kill -9 %d && nohup /opt/test_wenshu/crawler/crawler_bqjr/%s >/dev/null 2>&1 &" % (pid, script_name))
            # def start_wenshu_crawler(spider):
            #     self.logger.info("begin new process")
            #     process = CrawlerProcess(get_project_settings())
            #     process.crawl(spider)
            #     process.start()
            # p = Process(target=start_wenshu_crawler, args=(self,))
            # p.start()
            # # Get the pid, kill the process, and restart the crawler via nohup
            # pid = getpid()
            # self.logger.info("kill pid->%d" % pid)
            # kill(pid, 9)

    def exception_response(self, condition, response):
        if response.status != 200 \
                or "/Html_Pages/VisitRemind.html" in response.text \
                or response.text == "atZtw/muLK3OdYWrljShpg==":
            # On any error while fetching a document, record the failure info so it can be reviewed or re-crawled later
            self.exception_handle(condition,
                                  "status code:" + str(response.status))

    def dict_sorted(self, data):
        return sorted(data.items(), key=lambda t: len(t[0]), reverse=True)

    def closed(self, reason):
        if self.name != "condition_spider":
            self.lock.acquire()
            try:
                msg = super().closed(reason)
                self.logger.error("spider closed, pid->%d, reason->%s" %
                                  (self.pid, msg))
                with open("count.txt", "r") as f:
                    count = int(f.read())
                count += 1
                if count >= 20:
                    with open("pid.txt", "r") as f:
                        parent_pid = int(f.read())
                    self.logger.error(
                        "kill pid->%d, parent pid->%d, restart now!" %
                        (self.pid, parent_pid))
                    os_system(
                        "nohup /opt/test_wenshu/crawler/crawler_bqjr/start_app.sh "
                        ">/dev/null 2>&1 &")
                    os_system("kill -9 %d" % self.pid)
                else:
                    with open("count.txt", "w") as f:
                        f.write(str(count))
                    self.logger.error("kill pid->%d" % self.pid)
                    os_system("kill -9 %d" % self.pid)
            except Exception:
                self.logger.error("kill pid or restart error!")
            self.lock.release()

    def err_callback(self, failure):
        self.logger.error("request error->%s" % repr(failure))
        if self.name in ["wenshu_pc_spider", "wenshu_app_spider"]:
            self.exception_handle(self.condition,
                                  "request failure, change proxy!")
Example #22
def update_name_words():
    from collections import defaultdict
    from itertools import islice, chain
    from data_storage.db_settings import MONGO_COMPANY_DB, MONGO_SHIXIN_DB, \
        MONGO_COMPANY_DETAIL_COLLECTIONS, MONGO_COMPANY_DETAIL3_COLLECTIONS, \
        MONGO_SHIXIN_DETAIL_COLLECTIONS, MONGO_ZHIXING_DETAIL_COLLECTIONS, \
        MONGO_P2P_DEADBEAT_COLLECTIONS
    from data_storage.mongo_db import MongoDB

    name_words = get_name_words()
    hanzi_start_ord = ord("\u4E00")
    hanzi_end_ord = ord("\u9FA5")

    first_names = name_words.first_names
    rare_words = set(i for i in name_words.rare_words
                     if hanzi_start_ord <= ord(i) <= hanzi_end_ord)

    long_first_names = [w for w in first_names if len(w) > 1]
    single_first_names = [w for w in first_names if len(w) == 1]

    first_name_stat = defaultdict(int)
    word_stat = defaultdict(int)
    parse_count = 0

    def name_parse(name):
        if not name or len(name) > 6:
            return

        nonlocal parse_count

        parse_count += 1
        first_name = name[0]
        index = 1
        for i in long_first_names:
            if name.startswith(i):
                first_name = i
                index = 2
                break
        else:
            for i in single_first_names:
                if name.startswith(i):
                    first_name = i
                    index = 1
                    break

        if index == 2 or hanzi_start_ord <= ord(first_name) <= hanzi_end_ord:
            first_name_stat[first_name] += 1

        for w in islice(name, index, None):
            if hanzi_start_ord <= ord(w) <= hanzi_end_ord:
                word_stat[w] += 1

    def company_detail(mongo_instance):
        for i in mongo_instance.getAll(fields={
                "legal_person": 1,
                "member_info": 1,
                "shareholder_info": 1,
                "_id": 0
        }):
            name_set = set()
            name_set.add(i.get("legal_person"))
            name_set.update(j[0] for j in i.get("member_info", []))
            name_set.update(j[0] for j in i.get("shareholder_info", []))
            for j in name_set:
                name_parse(j)
            del name_set

    def shixin(mongo_instance):
        for item in mongo_instance.getAll(fields={"name": 1, "_id": 0}):
            try:
                name_parse(item["name"])
            except Exception:
                continue

    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL_COLLECTIONS) as mongo_instance:
        company_detail(mongo_instance)
    with MongoDB(MONGO_COMPANY_DB,
                 MONGO_COMPANY_DETAIL3_COLLECTIONS) as mongo_instance:
        company_detail(mongo_instance)

    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_SHIXIN_DETAIL_COLLECTIONS) as mongo_instance:
        shixin(mongo_instance)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        shixin(mongo_instance)
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_P2P_DEADBEAT_COLLECTIONS) as mongo_instance:
        shixin(mongo_instance)

    for k in ['钅', "亻", "扌", "犭"]:
        try:
            del first_name_stat[k]
        except Exception:
            pass
        try:
            del word_stat[k]
        except Exception:
            pass
        try:
            rare_words.remove(k)
        except Exception:
            pass

    # print(sorted(first_name_stat.items(), key=itemgetter(1), reverse=True))
    # print(sorted(word_stat.items(), key=itemgetter(1), reverse=True))

    # Handle surnames
    new_common_first_names = []
    new_rare_first_names = []
    threshold = int(parse_count * 1E-4)
    for i in first_name_stat:
        if first_name_stat[i] > threshold:
            new_common_first_names.append(i)
        else:
            new_rare_first_names.append(i)
    new_rare_first_names.extend(i for i in first_names
                                if i not in first_name_stat and len(i) > 1)

    def _print_words(words_list, stat_dict):
        ret_list = sorted(words_list, key=lambda a: stat_dict[a], reverse=True)
        for word in ret_list:
            print(word, stat_dict[word], sep=":", end=", ")
        print()
        print("".center(100, "-"))
        return ret_list

    new_common_first_names = _print_words(new_common_first_names,
                                          first_name_stat)
    new_rare_first_names = _print_words(new_rare_first_names, first_name_stat)

    # Handle given-name characters
    all_first_names = new_common_first_names + new_rare_first_names
    # for i in all_first_names:  # remove characters that already appear as surnames
    #     try:
    #         del word_stat[i]
    #     except Exception:
    #         pass

    new_popular_words = []
    new_common_words = []
    threshold1 = int(parse_count * 1E-4)
    threshold2 = int(parse_count * 4E-6)
    for i in word_stat:
        count = word_stat[i]
        if count > threshold1:
            new_popular_words.append(i)
        elif count > threshold2:
            new_common_words.append(i)
        else:
            rare_words.add(i)

    new_most_words = new_popular_words + new_common_words
    rare_words -= set(chain(all_first_names, new_most_words))
    # rare_words &= word_stat.keys()

    new_popular_words = _print_words(new_popular_words, word_stat)
    new_common_words = _print_words(new_common_words, word_stat)
    new_rare_words = _print_words(rare_words, word_stat)

    name_words.common_first_names = new_common_first_names
    name_words.rare_first_names = new_rare_first_names
    name_words.first_names = all_first_names
    name_words.popular_words = new_popular_words
    name_words.common_words = new_common_words
    name_words.most_words = new_most_words
    name_words.rare_words = new_rare_words

    with open(NAME_WORDS_FILE_NAME, "wb") as f:
        dump(name_words, f)
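
To make the word-frequency thresholds above concrete: with parse_count = 1,000,000 parsed names, threshold1 = int(1e6 * 1E-4) = 100 and threshold2 = int(1e6 * 4E-6) = 4, so a character seen more than 100 times becomes "popular", one seen 5 to 100 times becomes "common", and anything rarer falls into rare_words. A small standalone sketch of that classification (the counts are made up for illustration):

word_stat = {"伟": 250, "强": 60, "犇": 2}   # illustrative counts only
parse_count = 1_000_000
threshold1 = int(parse_count * 1E-4)        # 100
threshold2 = int(parse_count * 4E-6)        # 4
for w, count in word_stat.items():
    if count > threshold1:
        print(w, "-> popular")
    elif count > threshold2:
        print(w, "-> common")
    else:
        print(w, "-> rare")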
Example #23
                form_data = parse_qs(urlsplit(response.url).query)
                old_captcha_code = form_data["j_captcha"][0]
                if old_captcha_code == self.captcha_code:
                    self.captcha_code = self.get_captcha_code(response)

                form_data_new = {
                    "id": form_data["id"][0],
                    "j_captcha": self.captcha_code,
                    "captchaId": self.captcha_id,
                }
                yield Request("http://zhixing.court.gov.cn/search/newdetail?" +
                              urlencode(form_data_new),
                              self.parse_item,
                              dont_filter=True,
                              meta=response.meta,
                              errback=self.err_callback)
            else:
                data = json_loads(text)
                item["id"] = data.get("partyCardNum", "")
                item["execution_court"] = data.get("execCourtName")
                item["execution_money"] = data.get("execMoney")
                yield item
        except Exception:
            self.logger.exception("text(%s) url(%s)" % (text, response.url))


if __name__ == '__main__':
    with MongoDB(MONGO_SHIXIN_DB,
                 MONGO_ZHIXING_DETAIL_COLLECTIONS) as mongo_instance:
        count_zhixing_id(mongo_instance)
Example #24
    def parse(self, response):
        ssdb_conn = get_ssdb_conn()
        mongo_instance = MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_COLLECTIONS)
        company = ""
        # time_start = time()
        while True:
            try:
                company = get_one_company(mongo_instance, ssdb_conn)
                if company is not None:
                    # DetailGSXTSpider.scrapy_count += 1
                    company_name = company["name"]
                    driver = self.__getwebdriver__()
                    self.driver = driver
                    self.logger.info("正在爬取公司:%s" % company_name)
                    self.wait_20 = self.__getwait_20__()
                    self.wait_10 = self.__getwait_10__()
                    driver.get(response.url)
                    # with open("webpage.html", "w",encoding='utf-8') as file:
                    #     file.write(driver.page_source)

                    self.wait_20.until(lambda d: d.find_element_by_xpath(
                        "//input[@id='keyword']").is_displayed())

                    # Keyword input box
                    keyword_input = driver.find_element_by_id("keyword")
                    keyword_input.send_keys(company_name)

                    # Click the search button
                    submit_btn = driver.find_element_by_id("btn_query")
                    submit_btn.click()

                    # If TimeoutException is still raised after 3 retries, skip this query
                    try_counts = 3
                    while True:
                        try:
                            self.wait_20.until(
                                lambda d: d.find_element_by_xpath(
                                    "//div[@class='gt_cut_bg gt_show']").
                                is_displayed())
                            break
                        except Exception:
                            submit_btn.click()
                            try_counts -= 1
                            if try_counts == 0:
                                break
                    if try_counts == 0:
                        continue
                    hack = GeetestHack(driver, self.wait_10, self.logger)
                    is_successful = hack.drag_and_move_slider(
                        "//div[@class='gt_cut_bg gt_show']",
                        "//div[@class='gt_cut_fullbg gt_show']",
                        "//div[@class='gt_cut_bg gt_show']"
                        "/div[@class='gt_cut_bg_slice']",
                        "//div[@class='gt_cut_fullbg gt_show']"
                        "/div[@class='gt_cut_fullbg_slice']",
                        "//div[@class='gt_slider_knob gt_show']",
                        "//a[@class='search_list_item db']")
                    tries = 5
                    if not is_successful:
                        sleep(2)
                        try:
                            while True:
                                self.wait_20.until(
                                    lambda the_driver: the_driver.
                                    find_element_by_xpath(
                                        "//div[@class='gt_cut_bg gt_show']"
                                    ).is_displayed())
                                hack.drag_and_move_slider(
                                    "//div[@class='gt_cut_bg gt_show']",
                                    "//div[@class='gt_cut_fullbg gt_show']",
                                    "//div[@class='gt_cut_bg gt_show']"
                                    "/div[@class='gt_cut_bg_slice']",
                                    "//div[@class='gt_cut_fullbg gt_show']"
                                    "/div[@class='gt_cut_fullbg_slice']",
                                    "//div[@class='gt_slider_knob gt_show']",
                                    "//a[@class='search_list_item db']")
                                if tries == 0:
                                    break
                                tries -= 1
                                sleep(0.8)
                        except Exception as e:
                            self.logger.warning("爬取异常:{message:%s}" % str(e))
                    if tries == 0:
                        # Query failed; move on to the next company
                        self.logger.debug("Captcha crack failed, company name: %s" % company_name)
                        continue
                    try:
                        # Query succeeded; return the company info
                        company_list = driver.find_elements_by_xpath(
                            "//a[@class='search_list_item db']")
                        if company_list:
                            company_link = company_list[0].get_attribute(
                                "href")
                            driver.get(company_link)
                            self.wait_10.until(
                                lambda d: d.find_element_by_xpath(
                                    "//div[@id='primaryInfo']"
                                    "/div[@class='details "
                                    "clearfix']").is_displayed())

                            response = HtmlResponse(driver.current_url,
                                                    encoding="utf-8",
                                                    body=driver.page_source)
                            yield self.parse_search(company_name, response)
                    except Exception:
                        self.logger.info("爬取异常:国家企业信用信息公示系统没有%s的相关信息" %
                                         company_name)
                else:
                    yield Request(DO_NOTHING_URL,
                                  self.do_nothing,
                                  errback=self.do_nothing,
                                  dont_filter=True)
            except Exception as e:
                self.logger.warning("爬取异常:{company: %s,message:%s}" %
                                    (company, str(e)))
            finally:
                if hasattr(self, 'driver') and self.driver is not None:
                    self.driver.quit()
Example #25
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mongo_instance = MongoDB(MONGO_COMPANY_DB,
                                      MONGO_COMPANY_DETAIL2_COLLECTIONS)
Example #26
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        with MongoDB(MONGO_COMPANY_DB, MONGO_COMPANY_DETAIL2_COLLECTIONS) as mongo_instance:
            self.name_set = set(item["name"] for item in
                                mongo_instance.getAll(fields={"name": 1, "_id": 0}))
Example #27
class DetailTianyanchaSpider(CompanySpider):
    name = "tianyancha"
    allowed_domains = ["tianyancha.com"]
    start_urls = ["http://www.tianyancha.com/"]

    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mongo_instance = MongoDB(MONGO_COMPANY_DB,
                                      MONGO_COMPANY_DETAIL2_COLLECTIONS)

    def _get_one_company(self):
        while True:
            _id = self.ssdb_conn.qpop_front(SSDB_TIANYANCHA_QUEUE_NAME)
            if _id is not None:
                a_company = self.mongo_instance.getOne(
                    filter={"_id": ObjectId(_id)})
                if a_company is not None:
                    a_company.pop("_id")
                    return a_company
            else:
                return None

    def start_requests(self):
        parse_company_name = self.parse_company_name
        _get_one_company = self._get_one_company
        tianyancha_id_pattern = re_compile(r"(\d+)$")
        while True:
            a_company = _get_one_company()
            if a_company is not None:
                try:
                    tianyancha_id = tianyancha_id_pattern.search(
                        a_company["search_url"]).group(1)
                    a_company["tianyancha_id"] = tianyancha_id
                    request = Request(
                        "http://www.tianyancha.com/near/s.json?id=%s" %
                        tianyancha_id, parse_company_name)
                    request.meta["company_other_info"] = a_company
                    yield request
                except Exception:
                    self.logger.error("No tianyancha_id url(%s)" %
                                      a_company["search_url"])
            else:
                yield Request(DO_NOTHING_URL,
                              self.do_nothing,
                              errback=self.do_nothing,
                              dont_filter=True)

    def parse(self, response):
        meta = response.meta
        tianyancha_id = meta["company_other_info"]["tianyancha_id"]
        yield Request("http://www.tianyancha.com/company/%d.json" %
                      tianyancha_id,
                      self.parse_company,
                      meta=response.meta)

    def parse_company_name(self, response):
        try:
            text = response.text

            if '"state":"ok"' in text:  # 成功
                spider_name = self.name
                name_exists_func = self.is_search_name_exists
                record_name_func = self.record_search_name
                datas = json_loads(text)["data"]
                if "items" in datas:
                    for data in datas["items"]:
                        name = data["name"]
                        if not name:
                            continue

                        if name_exists_func(name):
                            continue
                        record_name_func(name)

                        item = CompanyItem()
                        item["from_web"] = spider_name
                        item["from_url"] = ("http://www.tianyancha.com/company/"
                                            + data["id"])
                        item["area"] = "shenzhen"
                        item["name"] = name
                        yield item
            else:
                self.logger.warning("天眼查---查找相关公司失败,URL(%s)" % response.url)
        except Exception:
            self.logger.exception("天眼查---查找相关公司异常,URL(%s)" % response.url)

    def parse_company(self, response):
        try:
            text = response.text

            if '"state":"ok"' in text:  # 成功
                datas = json_loads(text)
                pass
            else:
                self.logger.error("天眼查---搜索公司失败,URL(%s)" % response.url)
        except Exception:
            self.logger.exception("天眼查---搜索公司异常,URL(%s)" % response.url)
Example #28
    def generate_query(self):
        """
        Generate query conditions and push them onto the query queue.
        :return:
        """
        self.logger.info("query condition init begin!")

        # Courts to query (part of the query condition)
        mongo_instance = MongoDB(MONGO_WENSHU_DB, MONGO_CHINACOURT_COLLECTIONS)
        # Keep the cursor from timing out
        cursor = mongo_instance.getAll(fields={
            "_id": 1,
            "name": 1
        },
                                       sort=[("province", MONGO_ASCENDING)],
                                       no_cursor_timeout=True)
        court_list = [court["name"] for court in cursor]
        # Case types
        case_type_list = ["1", "2", "3", "4", "5"]
        for court in court_list:
            count = 1
            avg_interval = 0  # when the count is very large, use total/220 directly as the interval in days
            avg_interval_first = 0
            start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
            end_date = datetime.strptime(self.end_date, "%Y-%m-%d")
            while True:
                divisor = (count**2) if count != 2 else 2
                # Average interval in days
                interval_day = avg_interval if avg_interval > 0 else ceil(
                    (end_date - start_date).days / divisor)
                if avg_interval_first > 0:
                    avg_interval = avg_interval_first
                    avg_interval_first = 0
                self.logger.info("interval_day->%s" % interval_day)

                # Build the query date range
                end_date_temp = min(start_date + timedelta(days=interval_day),
                                    end_date)
                query_date = "%s TO %s" % (start_date.strftime("%Y-%m-%d"),
                                           end_date_temp.strftime("%Y-%m-%d"))
                self.logger.info("query_date->%s!" % query_date)
                query_condition = dict()
                query_condition["case_type"] = "0"  # 所有类型
                query_condition["court"] = court
                query_condition["date"] = query_date
                if self.is_query_condition_exists(query_condition):
                    if end_date == end_date_temp:
                        self.logger.info("%s query_condition exists!break!" %
                                         court)
                        break
                    else:
                        start_date = end_date_temp + self.one_day
                        self.logger.info(
                            "%s query_condition exists!continue!" %
                            json_dumps(query_condition))
                        continue
                # If the result count is <= 220, record the condition and break out of this loop
                query_count = self.get_count_by_condition(court=court,
                                                          date=query_date)
                if 0 <= query_count <= 220:
                    if query_count > 0:
                        self.record_query_condition(query_condition)
                        self.push_query_condition_queue(query_condition)
                    # Result count is 0: only save to mongo, with status -1
                    if query_count == 0:
                        self.record_query_condition(query_condition, -1)
                    if end_date == end_date_temp:
                        if count > 1:
                            # Also store each court's full polling date range in mongodb, with status -1
                            init_date = "%s TO %s" % (self.start_date,
                                                      self.end_date)
                            query_condition["date"] = init_date
                            self.record_query_condition(query_condition, -1)
                        self.logger.info("%s query condition end!" % court)
                        break
                    else:
                        start_date = end_date_temp + self.one_day
                else:
                    if count > 1:
                        avg_interval_first = avg_interval
                    temp_days = (end_date_temp - start_date).days
                    try:
                        avg_interval = int(180 /
                                           (int(query_count) / temp_days))
                    except ZeroDivisionError:
                        self.logger.exception("爬取出错,出错原因:")
                        break
                    # If the interval is already 1 day and the count still exceeds 220, split the condition further by case type when saving
                    if temp_days == 1:
                        for case_type in case_type_list:
                            query_condition["case_type"] = case_type
                            if not self.is_query_condition_exists(
                                    query_condition):
                                self.record_query_condition(query_condition)
                                self.push_query_condition_queue(
                                    query_condition)
                        if end_date == end_date_temp:
                            if count > 1:
                                # Also store each court's full polling date range in mongodb, with status -1
                                init_date = "%s TO %s" % (self.start_date,
                                                          self.end_date)
                                query_condition["date"] = init_date
                                self.record_query_condition(
                                    query_condition, -1)
                            self.logger.info("%s query condition end!" % court)
                            break
                        else:
                            start_date = end_date_temp + self.one_day
                count += 1
        self.logger.info("query condition init end!")
Example #29
class ProxyApi(object):
    def __init__(self):
        self.mongo_instance = MongoDB(MONGO_PROXY_DB,
                                      MONGO_PROXY_COLLECTIONS)  # thread-safe, with a connection pool
        self.scheme_filter_dict = {
            SchemeType.HTTP: {
                "$ne": SchemeType.HTTPS
            },
            SchemeType.HTTPS: {
                "$ne": SchemeType.HTTP
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$eq": SchemeType.HTTP_OR_HTTPS
            },
        }
        self.method_filter_dict = {
            SupportMethod.GET: {
                "$ne": SupportMethod.POST
            },
            SupportMethod.POST: {
                "$ne": SupportMethod.GET
            },
            SupportMethod.GET_OR_POST: {
                "$eq": SupportMethod.GET_OR_POST
            },
        }
        self.good_quality_dict = {
            SchemeType.HTTP: {
                "$gte": len(HTTP_CHECK_URL_LIST)
            },
            SchemeType.HTTPS: {
                "$gte": len(HTTPS_CHECK_URL_LIST)
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$gte": len(HTTPS_CHECK_URL_LIST)
            },
        }
        self.good_response_time_dict = {
            SchemeType.HTTP: {
                "$lt": 1,
                "$gte": 0
            },
            SchemeType.HTTPS: {
                "$lt": 3,
                "$gte": 0
            },
            SchemeType.HTTP_OR_HTTPS: {
                "$lt": 1,
                "$gte": 0
            },
        }

    def close(self):
        self.mongo_instance.close()

    def get_proxy_from_kuaidaili(self, stable_time=StableTime.MIN_10):
        try:
            url = "http://dps.kuaidaili.com/api/getdps/?" \
                  "orderid=959308673589451&num=50&sep=2&ut=" + str(stable_time)
            resp = http_get(url)
            if resp.status_code != 200:
                raise NoProxyException

            return resp.text.split()
        except Exception:
            from traceback import print_exc
            print_exc()
            raise NoProxyException

    def get_proxy_all(self,
                      location=None,
                      anonymous=AnonymousLevel.MIDDLE,
                      scheme=SchemeType.HTTP,
                      method=SupportMethod.GET):
        the_filter = {
            "quality": self.good_quality_dict[scheme],
            "response_time": self.good_response_time_dict[scheme],
            # "anonymous_level": {"$lte": anonymous},
            "scheme_type": self.scheme_filter_dict[scheme],
            # "support_method": self.method_filter_dict[method],
        }
        # if location:
        #     the_filter["location"] = re_compile(".*" + location + ".*")

        cursor = self.mongo_instance.getAll(filter=the_filter,
                                            fields={
                                                "ip": 1,
                                                "port": 1,
                                                "_id": 0
                                            },
                                            sort=[("response_time",
                                                   MONGO_ASCENDING)])
        return [item["ip"] + ":" + str(item["port"]) for item in cursor]

    def get_proxy_one(self,
                      location=None,
                      anonymous=AnonymousLevel.MIDDLE,
                      scheme=SchemeType.HTTP,
                      method=SupportMethod.GET,
                      stable_time=StableTime.MIN_10):
        good_proxys = self.get_proxy_all(location, anonymous, scheme, method)
        if good_proxys:
            return rand_choice(good_proxys)
        else:
            raise NoProxyException
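
A minimal usage sketch for ProxyApi, mirroring how the spiders above attach a proxy to an outgoing Scrapy request via the standard meta["proxy"] key; the URL and callback are placeholders, and the NoProxyException handling reflects how the class signals an empty pool:

api = ProxyApi()
try:
    proxy = api.get_proxy_one(scheme=SchemeType.HTTPS)  # e.g. "1.2.3.4:8080"
    request = Request("https://example.com/", callback=parse_page)  # placeholder URL/callback
    request.meta["proxy"] = "http://" + proxy
except NoProxyException:
    # No proxy currently satisfies the quality/response-time filters.
    pass
finally:
    api.close()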