示例#1
0
class DetailTianyanchaSpider(CompanySpider):
    """Spider that discovers related companies on tianyancha.com.

    Company ids are popped from an SSDB queue, the matching record is loaded
    from MongoDB, and tianyancha's ``near/s.json`` endpoint is queried with
    the id found at the end of the record's ``search_url``. Every previously
    unseen company name in the answer is emitted as a ``CompanyItem``.
    """
    # Spider identifier; also written into each item's "from_web" field.
    name = "tianyancha"
    allowed_domains = ["tianyancha.com"]
    start_urls = ["http://www.tianyancha.com/"]

    # Per-spider setting overrides: a download delay of 3 and at most one
    # concurrent request per domain.
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and open the company-detail collection.

        All positional and keyword arguments are forwarded unchanged to the
        parent class.
        """
        super().__init__(*args, **kwargs)
        detail_collection = MongoDB(MONGO_COMPANY_DB,
                                    MONGO_COMPANY_DETAIL2_COLLECTIONS)
        self.mongo_instance = detail_collection

    def _get_one_company(self):
        """Return the next queued company record, or None when the queue is empty.

        Ids are popped from the front of the SSDB queue until one of them
        resolves to a MongoDB document; ids without a document are skipped.
        The ``_id`` field is removed from the returned document.
        """
        while True:
            queued_id = self.ssdb_conn.qpop_front(SSDB_TIANYANCHA_QUEUE_NAME)
            if queued_id is None:
                # Nothing left in the queue.
                return None

            record = self.mongo_instance.getOne(
                filter={"_id": ObjectId(queued_id)})
            if record is None:
                # Stale id: try the next one.
                continue

            record.pop("_id")
            return record

    def start_requests(self):
        """Yield one "nearby companies" request per queued company, forever.

        For every company returned by ``_get_one_company`` the numeric id at
        the end of its ``search_url`` is extracted, stored on the record as
        ``tianyancha_id`` and used to build the ``near/s.json`` request; the
        record travels with the request in ``meta["company_other_info"]``.
        Records without a usable ``search_url`` are logged and skipped.

        When the queue is empty a request for ``DO_NOTHING_URL`` is yielded
        instead (presumably to keep the crawl alive while waiting for new
        ids -- confirm against ``do_nothing``).
        """
        parse_company_name = self.parse_company_name
        _get_one_company = self._get_one_company
        tianyancha_id_pattern = re_compile(r"(\d+)$")
        while True:
            a_company = _get_one_company()
            if a_company is None:
                yield Request(DO_NOTHING_URL,
                              self.do_nothing,
                              errback=self.do_nothing,
                              dont_filter=True)
                continue

            # Look the URL up once with .get(): the previous error handler
            # re-indexed a_company["search_url"], so a record without that key
            # raised KeyError from inside the handler and ended the generator.
            search_url = a_company.get("search_url")
            matched = (tianyancha_id_pattern.search(search_url)
                       if isinstance(search_url, str) else None)
            if matched is None:
                self.logger.error("No tianyancha_id url(%s)" % (search_url,))
                continue

            tianyancha_id = matched.group(1)
            a_company["tianyancha_id"] = tianyancha_id
            request = Request(
                "http://www.tianyancha.com/near/s.json?id=%s" % tianyancha_id,
                parse_company_name)
            request.meta["company_other_info"] = a_company
            yield request

    def parse(self, response):
        """Request the company detail JSON for the id carried in the meta.

        Reads ``meta["company_other_info"]["tianyancha_id"]`` from the
        response and yields a request for ``/company/<id>.json`` handled by
        ``parse_company``; the meta dict is passed along unchanged.
        """
        meta = response.meta
        tianyancha_id = meta["company_other_info"]["tianyancha_id"]
        # start_requests stores the id as the string captured by its regex, so
        # it has to be interpolated with %s: "%d" raises TypeError for a str.
        yield Request("http://www.tianyancha.com/company/%s.json" %
                      tianyancha_id,
                      self.parse_company,
                      meta=meta)

    def parse_company_name(self, response):
        """Yield a ``CompanyItem`` for every unseen company in the response.

        The body is treated as successful only when it contains the literal
        ``"state":"ok"``. Each entry of ``data.items`` with a non-empty name
        that ``is_search_name_exists`` does not know yet is recorded through
        ``record_search_name`` and emitted. A failed lookup is logged as a
        warning; any exception is logged with its traceback and ends the
        processing of this response.
        """
        try:
            text = response.text

            if '"state":"ok"' not in text:
                self.logger.warning("天眼查---查找相关公司失败,URL(%s)" % response.url)
                return

            spider_name = self.name
            name_exists_func = self.is_search_name_exists
            record_name_func = self.record_search_name
            datas = json_loads(text)["data"]
            if "items" not in datas:
                return

            for data in datas["items"]:
                name = data["name"]
                if not name or name_exists_func(name):
                    continue

                # Build the URL with %s so a numeric JSON id works as well as
                # a string one (str + int raised TypeError), and build it
                # before recording the name so an entry that cannot be turned
                # into an item is not marked as already seen.
                company_url = "http://www.tianyancha.com/company/%s" % data["id"]
                record_name_func(name)

                item = CompanyItem()
                item["from_web"] = spider_name
                item["from_url"] = company_url
                item["area"] = "shenzhen"
                item["name"] = name
                yield item
        except Exception:
            self.logger.exception("天眼查---查找相关公司异常,URL(%s)" % response.url)

    def parse_company(self, response):
        try:
            text = response.text

            if '"state":"ok"' in text:  # 成功
                datas = json_loads(text)
                pass
            else:
                self.logger.error("天眼查---搜索公司失败,URL(%s)" % response.url)
        except Exception:
            self.logger.exception("天眼查---搜索公司异常,URL(%s)" % response.url)
示例#2
0
class WenshuSpider(NoticeClosedSpider):
    def __init__(self, *args, **kwargs):
        """Set up the storage handles, the first proxy, the pid and a lock.

        All positional and keyword arguments are forwarded unchanged to the
        parent class.
        """
        super().__init__(*args, **kwargs)

        # Storage back-ends: SSDB connection and the Mongo condition collection.
        self.ssdb_conn = get_ssdb_conn()
        self.mongo_instance = MongoDB(MONGO_WENSHU_DB,
                                      MONGO_WENSHU_CONDITION_COLLECTIONS)

        # Fetch an initial proxy right away.
        proxy_api = ProxyApi()
        self.proxy_api = proxy_api
        self.proxy = proxy_api.get_proxy_one()

        # The pid is remembered because closed() kills this process by pid.
        current_pid = getpid()
        self.pid = current_pid
        self.lock = Lock()
        self.logger.info("init pid->%d" % current_pid)

    def is_query_condition_exists(self, condition):
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            result = self.mongo_instance.getOne(
                filter={"condition": condition},
                fields={
                    "condition": 1,
                    "status": 1,
                    "_id": 0
                })
            if result:
                return True
        except Exception:
            pass
        return False

    def record_query_condition(self, condition, status=0):
        try:
            condition = json_dumps(self.dict_sorted(condition),
                                   ensure_ascii=False)
            item = {
                "condition": condition,
                "status": status,
            }
            self.mongo_instance.insertOne(item)
        except Exception:
            return

    def push_query_condition_queue(self, condition):
        """Serialise a condition mapping and append it to the SSDB queue.

        ``condition`` must be a mapping: it is normalised with ``dict_sorted``
        and JSON-serialised before being pushed. Failures are logged and
        otherwise ignored; nothing is returned.
        """
        try:
            serialized = json_dumps(self.dict_sorted(condition),
                                    ensure_ascii=False)
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, serialized)
        except Exception:
            # Still best-effort, but a dropped condition is no longer silent.
            self.logger.exception("push query condition to queue failed")

    def clear_query_condition(self):
        """Delete every stored condition and empty the SSDB condition queue.

        The MongoDB collection is cleared first; if that raises, the queue is
        left untouched. Failures are logged and otherwise ignored.
        """
        try:
            self.mongo_instance.deleteMany(filter={})
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            self.logger.exception("clear query condition failed")

    def get_wenshu_condition(self):
        """Pop and return the next condition from the SSDB queue.

        Returns whatever ``qpop_front`` yields. If the pop raises, the error
        is logged and an empty dict is returned (kept for callers that rely
        on that fallback).
        """
        try:
            return self.ssdb_conn.qpop_front(SSDB_WENSHU_CONDITION_QUEUE)
        except Exception:
            self.logger.exception("pop wenshu condition failed")
        return {}

    def push_wenshu_condition(self, condition):
        """Append ``condition`` to the SSDB queue exactly as given.

        Unlike ``push_query_condition_queue`` no serialisation is applied.
        Failures are logged and otherwise ignored; nothing is returned.
        """
        try:
            self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, condition)
        except Exception:
            # Still best-effort, but a dropped condition is no longer silent.
            self.logger.exception("push wenshu condition failed")

    def is_wenshu_id_exists(self, file_id):
        """Return whether ``file_id`` is present in the SSDB id hash.

        If the lookup raises, the error is logged and True is returned, so
        the id is treated as already known.
        """
        try:
            return self.ssdb_conn.hexists(SSDB_WENSHU_ID_HSET, file_id)
        except Exception:
            self.logger.exception("check wenshu id failed, id->%s", file_id)
            return True

    def record_wenshu_id_error(self, file_id):
        """Mark ``file_id`` as failed in the SSDB error hash.

        The id is stored with an empty-string value. Failures are logged and
        otherwise ignored; nothing is returned.
        """
        try:
            self.ssdb_conn.hset(SSDB_WENSHU_ID_ERROR_HSET, file_id, "")
        except Exception:
            self.logger.exception("record wenshu error id failed, id->%s",
                                  file_id)

    def reset_wenshu_condition(self):
        """Rebuild the SSDB condition queue from the unfinished conditions.

        The queue is emptied and every MongoDB document with ``status == 0``
        is pushed back onto it. Failures are logged and otherwise ignored;
        nothing is returned.
        """
        try:
            # Empty the pending-condition queue.
            self.ssdb_conn.qclear(SSDB_WENSHU_CONDITION_QUEUE)
            # Re-queue every stored condition whose status is still 0.
            cursor = self.mongo_instance.getAll(filter={"status": 0},
                                                fields={
                                                    "condition": 1,
                                                    "status": 1,
                                                    "_id": 0
                                                })
            for item in cursor:
                condition = item["condition"]
                if isinstance(condition, str):
                    # record_query_condition stores the condition already
                    # JSON-serialised. Sending that string through
                    # push_query_condition_queue made dict_sorted call
                    # .items() on a str, which failed silently and left the
                    # queue empty, so push the string as it is.
                    self.push_wenshu_condition(condition)
                else:
                    self.push_query_condition_queue(condition)
        except Exception:
            self.logger.exception("reset wenshu condition queue failed")

    def exception_handle(self, condition, error_info):
        """Re-queue a failed condition, switch proxy and yield a retry request.

        This is a generator: nothing happens until the caller iterates it
        (for example with ``yield from`` inside a Scrapy callback). For the
        spider named ``condition_spider`` it yields nothing.

        Steps, in order: push ``condition`` back onto the queue, log
        ``error_info``, block for 5 seconds, fetch a new proxy, then build a
        POST request for ``self.list_url`` with ``self.req_data`` as JSON
        body, attach the proxy and yield it. If any step raises, the error is
        logged and no request is yielded.
        """
        if self.name == "condition_spider":
            return

        try:
            # Whatever went wrong, put the condition back so it is not lost.
            self.push_wenshu_condition(condition)
            self.logger.info("parse or parse_doc error->%s" %
                             str(error_info))
            # Back off briefly before retrying.
            self.logger.info("sleep start!")
            sleep(5)
            self.logger.info("sleep end!")
            # Retry through a different proxy.
            self.proxy = self.proxy_api.get_proxy_one()
            self.logger.error("request retry")
            request = Request(url=self.list_url,
                              method='POST',
                              callback=self.parse,
                              body=json_dumps(self.req_data),
                              headers=self.headers,
                              dont_filter=True,
                              errback=self.err_callback)
            self.set_proxy(request)
        except Exception:
            # The old handler called self.exception_handle(...) again here,
            # but calling a generator function only creates a generator that
            # nobody iterated, so the error vanished. Log it instead; a real
            # recursive retry could loop forever on a persistent failure.
            self.logger.exception("change proxy error!")
            return

        yield request

    def exception_response(self, condition, response):
        if response.status != 200 \
                or "/Html_Pages/VisitRemind.html" in response.text \
                or response.text == "atZtw/muLK3OdYWrljShpg==":
            # 抓取文章出现任何异常,则把出错的信息加入到未抓取到的列表中方便以后查看或者重新采集
            self.exception_handle(condition,
                                  "status code:" + str(response.status))

    def dict_sorted(self, data):
        return sorted(data.items(), key=lambda t: len(t[0]), reverse=True)

    def closed(self, reason):
        """Kill this process when the spider closes, restarting every 20th time.

        Does nothing for the spider named ``condition_spider``. Otherwise,
        while holding ``self.lock``:

        * the parent ``closed`` hook is called and its result logged;
        * the counter in ``count.txt`` is read and incremented;
        * when it reaches 20, the parent pid from ``pid.txt`` is logged,
          ``start_app.sh`` is launched in the background and this process is
          killed (the counter file is not rewritten in this branch);
        * otherwise the new counter is written back and this process is
          killed.

        Any exception (for instance a missing ``count.txt``) is logged and
        the process is left running.
        """
        if self.name == "condition_spider":
            return

        # "with" guarantees the lock is released even if something other than
        # Exception propagates; the old acquire()/release() pair did not.
        with self.lock:
            try:
                msg = super().closed(reason)
                self.logger.error("spider closed, pid->%d, reason->%s" %
                                  (self.pid, msg))
                with open("count.txt", "r") as f:
                    count = int(f.read())
                count += 1
                if count >= 20:
                    with open("pid.txt", "r") as f:
                        parent_pid = int(f.read())
                    self.logger.error(
                        "kill pid->%d, parent pid->%d, restart now!" %
                        (self.pid, parent_pid))
                    os_system(
                        "nohup /opt/test_wenshu/crawler/crawler_bqjr/start_app.sh "
                        ">/dev/null 2>&1 &")
                else:
                    with open("count.txt", "w") as f:
                        f.write(str(count))
                    self.logger.error("kill pid->%d" % self.pid)
                # Both branches end by killing the current process.
                os_system("kill -9 %d" % self.pid)
            except Exception:
                # logger.exception keeps the traceback that logger.error lost.
                self.logger.exception("kill pid or restart error!")

    def err_callback(self, failure):
        self.logger.error("request error->%s" % repr(failure))
        if self.name in ["wenshu_pc_spider", "wenshu_app_spider"]:
            self.exception_handle(self.condition,
                                  "request failure, change proxy!")