Exemplo n.º 1
0
class ZdbPedaily_tzsj:
    def __init__(self):
        """Prepare the list-page URLs, the shared ``Util`` helper and the request headers."""
        # List pages p1 .. p769 of the investment-event index.
        page_template = "https://zdb.pedaily.cn/inv/p{}/"
        self.urls = list(map(page_template.format, range(1, 770)))
        self.util = Util()

        # The empty ``{}`` in the Cookie value (after ``Hm_lpvt_...=``) is a
        # placeholder that run() / get_detail_info() fill with
        # ``self.util.get_stamp()``.
        request_headers = {}
        request_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        request_headers["Accept-Encoding"] = "gzip, deflate, br"
        request_headers["Accept-Language"] = "zh-CN,zh;q=0.9"
        request_headers["Connection"] = "keep-alive"
        request_headers["Cookie"] = "__uid=1452122016; __fromtype=0; ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1564455299,1564997145,1565057017,1565061687; BAIDU_SSP_lcr=https://www.baidu.com/link?url=mXXXmWT7-LUN6gg9o-kkJIw_k0SkPj9aL3XGvS6wRVmJjG_3dfydZul0mdFS1rSa&wd=&eqid=cf1c52fe000195ab000000065d48f231; __utma=23980325.1444638820.1563415171.1565057028.1565061688.26; __utmc=23980325; __utmz=23980325.1565061688.26.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; __utmb=23980325.5.10.1565061688"
        request_headers["Host"] = "zdb.pedaily.cn"
        request_headers["Upgrade-Insecure-Requests"] = "1"
        request_headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        self.headers = request_headers

    def get_shareholder(self, id_code, detail_html):
        shareholder_info = detail_html.xpath("//table[@class=\"shareholder-info\"]/tbody/tr")
        if shareholder_info:
            for si in shareholder_info:
                shareholder_name = si.xpath("./td[1]/text()")[0]
                shareholder_type = si.xpath("./td[2]/text()")[0]
                if si.xpath("./td[3]/text()"):
                    shareholder_money = si.xpath("./td[3]/text()")[0]
                else:
                    shareholder_money = ""
                crawl_time = self.util.get_now_time()
                sql_sharholder = "insert into INV_EVT_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \
                                 "values('%s', '%s', '%s', '%s','%s')" % (
                    id_code, shareholder_name, shareholder_type, shareholder_money, crawl_time)

                self.util.insert2mysql("股东信息", sql_sharholder)

    def get_main_people(self, id_code, detail_html):
        main_people = detail_html.xpath("//div[@class=\"business-people\"]/ul/li")
        if main_people:
            for p in main_people:
                mp_name = p.xpath("./h3/text()")[0]
                mp_position = p.xpath("./p/text()")[0]

                crawl_time = self.util.get_now_time()

                sql_main_people = "insert into INV_EVT_MAIN_PSN_INF(ID, MAIN_PPL_NM, MAIN_PPL_POS, INPT_DT) values('%s', '%s', '%s','%s')" % (
                    id_code, mp_name, mp_position, crawl_time)
                self.util.insert2mysql("主要人物", sql_main_people)

    def get_detail_info(self, detail_url):
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        detail_res = self.util.get_req(url=detail_url, headers=self.headers)
        print(detail_res.status_code)
        if detail_res.status_code == 200:
            detail_html = self.util.get_xpath_obj(detail_res)
            # 详情页信息获取
            company_name = detail_html.xpath("//h1/text()")[0]
            company_base = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[1]/text()")[0]
            company_reg_loc = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[2]/text()")[0]
            company_bound_date = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[3]/text()")[0]
            company_industry = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[4]/text()")[0]
            if detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()"):
                company_site = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()")[0]
            else:
                company_site = ""
            if detail_html.xpath('//div[@class="box-fix-l"]/p/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/p/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/p/span/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/p/span/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/pre/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/pre/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/div/div/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/div/div/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/div/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/div/text()')[0]
            elif detail_html.xpath('//div[@id="cke_pastebin"]//text()'):
                company_intro = detail_html.xpath('//div[@id="cke_pastebin"]//text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/ul/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/ul/text()')[0]
            else:
                company_intro = ""

            if detail_html.xpath("//div[@id=\"business\"]"):
                legal_person = detail_html.xpath("//table[@class=\"base-info\"]/tr[1]/td[2]/text()")[0]
                founded_time = detail_html.xpath("//table[@class=\"base-info\"]/tr[1]/td[4]/text()")[0]
                registered_capital = detail_html.xpath("//table[@class=\"base-info\"]/tr[2]/td[2]/text()")[0]
                operational_authority = detail_html.xpath("//table[@class=\"base-info\"]/tr[2]/td[4]/text()")[0]
                registered_num = detail_html.xpath("//table[@class=\"base-info\"]/tr[3]/td[2]/text()")[0]
                approval_date = detail_html.xpath("//table[@class=\"base-info\"]/tr[3]/td[4]/text()")[0]

                organizational_code = detail_html.xpath("//table[@class=\"base-info\"]/tr[4]/td[2]/text()")[0]
                creditfcode = detail_html.xpath("//table[@class=\"base-info\"]/tr[4]/td[4]/text()")[0]
                identification_number = detail_html.xpath("//table[@class=\"base-info\"]/tr[5]/td[2]/text()")[0]
                registration_authority = detail_html.xpath("//table[@class=\"base-info\"]/tr[5]/td[4]/text()")[0]
                enterprise_type = detail_html.xpath("//table[@class=\"base-info\"]/tr[6]/td[2]/text()")[0]
            else:
                legal_person = ""
                founded_time = ""
                registered_capital = ""
                operational_authority = ""
                registered_num = ""
                approval_date = ""

                organizational_code = ""
                creditfcode = ""
                identification_number = ""
                registration_authority = ""
                enterprise_type = ""
            id_code = self.util.MD5(company_name + creditfcode)
            # 融资事件 信息处理
            for rz_html in detail_html.xpath("//div[@class=\"list-invest\"]/ul/li"):
                if rz_html.xpath("./div[@class=\"view\"]/a/@href")[0].startswith("http"):
                    rz_url = rz_html.xpath("./div[@class=\"view\"]/a/@href")[0]  # 融资事件新开页
                else:
                    rz_url = "https://zdb.pedaily.cn" + rz_html.xpath("./div[@class=\"view\"]/a/@href")[0]  # 融资事件新开页
                print(rz_url)
                rz_res = self.util.get_req(url=rz_url, headers=self.headers)
                if rz_res.status_code == 200:
                    rz_html = self.util.get_xpath_obj(rz_res.text)
                    # 投资事件 信息获取
                    rz_title = rz_html.xpath("//h1/text()")[0]
                    rz_info = "".join(rz_html.xpath("//div[@class=\"info\"]/ul/li//text()"))
                    rz_intro = rz_html.xpath("//div[@id=\"desc\"]/p/text()")[0]

                    crawl_time = self.util.get_now_time()

                    sql_rzsj = """insert into INV_EVT_INF(ID,CMP_NM,ORG_TOT_DEPT,REG_PLC_PNT,CMP_SET_UP_TM,AFL_IDT,FORML_WEB,CMP_INTRO,LVRG_NM,LVRG_INF,LVGR_DTL,LGP_INF,SET_UP_TM,REG_CPT,OPR_RIT,REG_NBR,APRV_TM,ORG_ORG_CD_NBR,SOC_CRD_CD,TAX_PSN_RCG_NBR,REG_INSTT,ENTP_TYP,INPT_DT
                                                                            )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                                                                            """ % (id_code,
                                                                                   company_name,
                                                                                   company_base,
                                                                                   company_reg_loc,
                                                                                   company_bound_date,
                                                                                   company_industry,
                                                                                   company_site,
                                                                                   company_intro,
                                                                                   rz_title,
                                                                                   rz_info,
                                                                                   rz_intro,
                                                                                   legal_person,
                                                                                   founded_time,
                                                                                   registered_capital,
                                                                                   operational_authority,
                                                                                   registered_num,
                                                                                   approval_date,
                                                                                   organizational_code,
                                                                                   creditfcode,
                                                                                   identification_number,
                                                                                   registration_authority,
                                                                                   enterprise_type,
                                                                                   crawl_time)
                    self.util.insert2mysql("融资公司信息", sql_rzsj)
            self.get_main_people(id_code, detail_html)
            self.get_shareholder(id_code, detail_html)

    def get_items_list(self, res):
        html = self.util.get_xpath_obj(res)
        for li in html.xpath("//ul[@id=\"inv-list\"]/li"):
            time.sleep(2)
            # 详情页获取
            if li.xpath("./div[1]/a/@href"):
                detail_url = "https://zdb.pedaily.cn" + li.xpath("./div[1]/a/@href")[0]  # 地址获取
            else:
                continue
            print(detail_url)
            self.get_detail_info(detail_url)

    def run(self):
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        for url in self.urls:
            print("列表页:" + url + "开始爬取")
            res = self.util.get_req(url=url, headers=self.headers)  # 列表页列表获取
            self.get_items_list(res)
Exemplo n.º 2
0
class Jobui:
    def __init__(self):
        """Prepare URLs, request headers, the DB cursor, request payloads and proxies."""
        # Entry page plus the template for filtered listing pages
        # (placeholders: industry, worker, impression, type, page number).
        self.url = "https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword="
        self.base_url = (
            "https://www.jobui.com/cmp?"
            "area=%E5%85%A8%E5%9B%BD&industry={}&worker={}&impression={}&type={}&n={}"
        )

        accept = (
            "text/html,application/xhtml+xml,"
            "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        )
        cookie = (
            "jobui_p=1565753151227_21067661; "
            "jobui_area=%25E6%25B7%25B1%25E5%259C%25B3; "
            "Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1565753152,1567047709,1567585344; "
            "PHPSESSID=kkdnm8jingh5vq1g7e1ora7pe3; "
            "jobui_img_logo=vbBZkTB2kbhlgdb8yFiTPdmw4wCW3uKOYJ%2F4lauoW4o%3D; "
            "TN_VisitCookie=42; TN_VisitNum=33; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1567585986"
        )
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        )
        self.headers = {
            "Accept": accept,
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": cookie,
            "Host": "www.jobui.com",
            "Pragma": "no-cache",
            "Referer": "https://www.jobui.com/cmp",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": user_agent,
        }

        self.u = Util()
        self.cursor = self.u.MySQL().cursor()

        # Payload for the entry page and the mutable payload reused for every
        # filtered listing request.
        self.data = dict(area="全国", keyword="")
        self.base_data = dict(area="全国", industry="", worker="", impression="", type="")

        # URLs whose request failed with a connection error.
        self.re_try_list = []
        # Needs ``self.cursor``, so it has to come after the cursor is created.
        self.proxies = self.get_proxy()

    def get_proxy(self):
        sql = "select ip, tp from ip_pool where tof = '1';"
        self.cursor.execute(sql)
        proxy = self.cursor.fetchall()
        proxies = {}
        for p in range(len(proxy)):
            proxies[proxy[p][0]] = proxy[p][1]
        return proxies

    def handle_data(self, req):
        if req.status_code == 200:
            html = self.u.get_xpath_obj(req.text)
            if html.xpath("//div[@class=\"no-result\"]"):
                print(">>>>>页面无数据")
            else:
                urls = [
                    "https://www.jobui.com" + i for i in html.xpath(
                        "//div[@class=\"company-segmetation\"]/a/@href")
                ]
                for url in urls:
                    print(url)
                    try:
                        # 解决多余警告
                        requests.packages.urllib3.disable_warnings()
                        proxy_key = random.choice(list(self.proxies.keys()))
                        print("<{}>".format(proxy_key))
                        proxies = {proxy_key: self.proxies[proxy_key]}
                        detail_req = requests.get(url=url,
                                                  headers=self.headers,
                                                  proxies=proxies,
                                                  verify=False)
                    except requests.exceptions.ConnectionError:
                        self.re_try_list.append(url)
                        print("网页未被请求到,已加入重试列表。")
                        continue
                    print("详情页请求完成,响应代码为:{}".format(detail_req.status_code))
                    detail_html = self.u.get_xpath_obj(detail_req.text)
                    if len(
                            detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dt")) == 4:
                        title = detail_html.xpath("//h1/a/text()")[0].strip()
                        if detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                        ):
                            brief_intro = detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                            )[0].strip()
                        else:
                            brief_intro = ""
                        xingzhi, guimo = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[1]/text()"
                        )[0].split(" / ")
                        hangye = ";".join([
                            i.strip() for i in detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dd[2]/a/text()"
                            )
                        ])
                        rongzi = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd/dd[@class=\"gray3\"]/text()"
                        )[0].strip()
                        quancheng = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()"
                        )[0].strip()
                        intro = "".join(
                            detail_html.xpath(
                                "//*[@id=\"textShowMore\"]/text()")).strip()
                    if len(
                            detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dt")) == 3:
                        title = detail_html.xpath("//h1/a/text()")[0].strip()
                        if detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                        ):
                            brief_intro = detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                            )[0].strip()
                        else:
                            brief_intro = ""
                        xingzhi, guimo = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[1]/text()"
                        )[0].split(" / ")
                        hangye = ";".join([
                            i.strip() for i in detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dd[2]/a/text()"
                            )
                        ])
                        rongzi = ""
                        quancheng = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()"
                        )[0].strip()
                        intro = "".join(
                            detail_html.xpath(
                                "//*[@id=\"textShowMore\"]/text()")).strip()
                    else:
                        quancheng = ""
                        title = ""
                        brief_intro = ""
                        xingzhi = ""
                        guimo = ""
                        hangye = ""
                        rongzi = ""
                        quancheng = ""
                        intro = ""
                    id_code = self.u.MD5(quancheng)
                    crawl_time = self.u.get_now_time()
                    sql = "insert into tmp_jobui(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, crawl_time) " \
                          "values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                          % (id_code, title, brief_intro, xingzhi,
                             guimo, hangye, rongzi, quancheng,
                             pymysql.escape_string(intro), crawl_time)
                    self.u.insert2mysql(title, sql)
                    print("-" * 100)
                    # time.sleep(3)
        else:
            print("请求失败,错误代码为:{}".format(req.status_code))

    def re_try(self):
        for rt in self.re_try_list:
            industry = re.findall(r'industry=(.*?)&', rt)[0]
            worker = re.findall(r'worker=(.*?)&', rt)[0]
            impression = re.findall(r'impression=(.*?)&', rt)[0]
            type = re.findall(r'type=(.*?)&', rt)[0]
            n = re.findall(r'n=(.*?)', rt)[0]

            self.base_data["industry"] = industry
            self.base_data["worker"] = worker
            self.base_data["impression"] = impression
            self.base_data["type"] = type
            self.base_data["n"] = n
            try:
                proxy_key = random.choice(list(self.proxies.keys()))
                print("<{}>".format(proxy_key))
                proxies = {proxy_key: self.proxies[proxy_key]}
                requests.packages.urllib3.disable_warnings()
                r = requests.get(url=rt,
                                 headers=self.headers,
                                 data=self.base_data,
                                 proxies=proxies)
                self.handle_data(r)
            except requests.exceptions.ConnectionError:
                self.re_try_list.append(rt)
                continue

    def main(self):
        """Crawl every filter combination of the company search, 50 pages each.

        The entry page is fetched first (one retry after a 300 s pause when the
        request raises) to read the four filter option lists.  For each
        combination the listing pages 1-50 are requested and passed to
        ``handle_data``; listing URLs that fail to connect are queued in
        ``self.re_try_list`` and ``re_try`` is called after the last combination.
        """

        def choose_proxy():
            # Pick a random proxy entry, announce it, wrap it for requests.
            key = random.choice(list(self.proxies.keys()))
            print("<{}>".format(key))
            return {key: self.proxies[key]}

        def fetch_entry_page(proxy_map):
            requests.packages.urllib3.disable_warnings()
            return requests.get(url=self.url,
                                headers=self.headers,
                                data=self.data,
                                proxies=proxy_map,
                                verify=False)

        first_proxy = choose_proxy()
        try:
            res = fetch_entry_page(first_proxy)
            print("请求状态码:" + str(res.status_code))
        except Exception as e:
            print("request has Error,Mes:" + str(e))
            time.sleep(300)
            res = fetch_entry_page(choose_proxy())

        if res.status_code == 403:
            print("403 Forbidden")
            return
        if res.status_code != 200:
            return

        html = self.u.get_xpath_obj(res.text)
        industries = html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")
        natures = html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()")
        sizes = html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[3]/div/div/a/text()")
        features = html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[4]/div/div/a/text()")

        # The first option of every filter list is skipped.
        for industry in industries[1:]:
            for nature in natures[1:]:
                for size in sizes[1:]:
                    for feature in features[1:]:
                        for page_no in range(1, 51):
                            # Build the request address.
                            print("开始构建请求地址")
                            use_url = self.base_url.format(
                                self.u.url_encode(industry),
                                self.u.url_encode(size),
                                self.u.url_encode(feature),
                                self.u.url_encode(nature),
                                page_no)
                            # Build the request payload.
                            self.base_data.update(industry=industry,
                                                  worker=size,
                                                  impression=feature,
                                                  type=nature)
                            try:
                                page_proxy = choose_proxy()
                                requests.packages.urllib3.disable_warnings()
                                r = requests.get(url=use_url,
                                                 headers=self.headers,
                                                 data=self.base_data,
                                                 proxies=page_proxy)
                            except requests.exceptions.ConnectionError:
                                self.re_try_list.append(use_url)
                            else:
                                self.handle_data(r)
        self.re_try()
Exemplo n.º 3
0
class Qlm_zbbg:
    def __init__(self):
        """Prepare the list-page URL template, page count, helper and headers."""
        self.base_url = "http://www.qianlima.com/zbbg/p{}"
        self.page = 200
        self.util = Util()

        # Browser-like request headers sent with every request of this crawler.
        request_headers = dict()
        request_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        request_headers["Accept-Encoding"] = "gzip, deflate"
        request_headers["Accept-Language"] = "zh-CN,zh;q=0.9"
        request_headers["Cache-Control"] = "max-age=0"
        request_headers["Connection"] = "keep-alive"
        request_headers["Cookie"] = "__jsluid_h=144847f002c5e67a5b7bf1888f49e19c; UM_distinctid=16c02c0e9b53d5-083f7603340745-e343166-144000-16c02c0e9b6403; gr_user_id=bfb0c075-bcf5-4e05-a943-8b3448f39a0d; Hm_lvt_0a38bdb0467f2ce847386f381ff6c0e8=1563432734; LXB_REFER=www.baidu.com; bridgeid=59454367; keywordUnit=40461; keywords=%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91; CNZZDATA1277608403=172402465-1563412202-%7C1563498692; BAIDU_SSP_lcr=https://www.baidu.com/link?url=BUcmE5CDcuTFAv7tI05xeq_80sbO-X-vNsQ1yhUvF_DGdoPt-o7VQs8t7AYRpXBm&wd=&eqid=da58e9c4000e34dc000000065d312603; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563414294,1563432734,1563432760,1563502122; qlm_old=\"http://www.qianlima.com/zb/detail/20190719_139475196.html\"; Hm_lpvt_0a38bdb0467f2ce847386f381ff6c0e8=1563502180; qlm_username=15561585051; qlm_password=RCf8ujm8K3EfguKmBCouKpgCKK7uopgU; rem_login=1; qlmll_his=\",139475750,139491436,139497668,139475763,139475196,139264733,139264636,139269995,\"; seo_refUrl=\"http://www.directlyaccess.com\"; seo_curUrl=\"http://www.qianlima.com/common/cat.jsp\"; CNZZDATA1848524=cnzz_eid%3D430053542-1563409337-%26ntime%3D1563503598; fromWhereUrl=\"http://www.qianlima.com/zbbg/\"; seo_intime=\"2019-07-19 10:57:07\"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563506743"
        request_headers["Host"] = "www.qianlima.com"
        request_headers["Referer"] = "http://www.qianlima.com/"
        request_headers["Upgrade-Insecure-Requests"] = "1"
        request_headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        self.headers = request_headers

    def get_url_mysql(self):
        for i in range(200):
            url = self.base_url.format(i)
            res = self.util.get_req(url=url, headers=self.headers)

            html = self.util.get_xpath_obj(res)
            for dl in html.xpath("//div[@class=\"sevenday_list\"]/dl"):
                detail_url = dl.xpath("./dt/a/@href")[0].strip()
                sql = "insert into qlm_zbbg_url(url,status) values ('%s','0')" % detail_url
                self.util.insert2mysql(detail_url, sql)
        self.util.MySQL().close()

    def get_mess(self):
        conn = self.util.MySQL()
        cursor = conn.cursor()
        sql = "select url from qlm_zbbg_url where status=0;"
        cursor.execute(sql)
        for detail_url in cursor.fetchall():
            print(detail_url[0])
            detail_html = self.util.get_xpath_obj(
                self.util.get_req(url=detail_url[0],
                                  headers=self.headers).text)
            try:
                detail_title = detail_html.xpath("//h2/text()")[0]
                detail_location = "".join(
                    detail_html.xpath("//span[@class=\"site\"]/a//text()"))
                detail_status = detail_html.xpath(
                    "//span[@class=\"zhuangtai\"]//text()")[0].replace(
                        "状态:", "")
                detail_date = detail_html.xpath(
                    "//span[@class=\"d2\"]/text()")[0]
                detail_content = re.findall(
                    r'<div id="wen".*?</div>',
                    self.util.get_req(url=detail_url[0],
                                      headers=self.headers).text,
                    re.S)[0].replace("\"", "\\\"").replace("\'", "\\\'")
                record_id = self.util.MD5(detail_title + detail_location)
                crawl_time = self.util.get_now_time()
                sql = """insert into INVT_PUB_BID_MDF_INF(ID, TTL, ZON, STS, INVT_PUB_BID_CNTNT, ISU_TM, DTL_LINK, INPT_DT,)
                                                    values('%s','%s','%s','%s','%s','%s','%s','%s')""" \
                      % (record_id,
                         detail_title,
                         detail_location,
                         detail_status,
                         detail_date,
                         detail_content,
                         detail_url[0],
                         crawl_time)
                up_sql = "update qlm_zbbg_url set status = 1  where url = '{}';".format(
                    detail_url[0])
                self.util.insert2mysql(detail_title, sql, up_sql)
                conn.commit()
            except IndexError:
                print("详情页请求失败")
                time.sleep(86400)
                q = Qlm_zbbg()
                q.run()

    def run(self):
        self.get_url_mysql()
        self.get_mess()
Exemplo n.º 4
0
class WebapiCninfo:
    def __init__(self):
        """Set up the header/form templates, HTTP session and endpoints of the crawler.

        Several header values contain ``{}`` placeholders (cookie timestamps,
        ``codeKey`` and ``mcode``); they are filled in with ``str.format`` by
        ``main``, ``get_comp_name`` and ``cut_comp_code`` before each request.
        """
        # Headers of the first GET (see ``main``); the single ``{}`` in the
        # cookie receives the current epoch time.
        self.get_code_key_h = {
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            "Cache-Control": "max-age=0",
            "Accept": "image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN",
            "Accept-Encoding": "gzip, deflate",
            "Host": "webapi.cninfo.com.cn",
            "Connection": "Keep-Alive",
            "Cookie": "cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557528,1564557544,1564557814,1564557966; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
        }
        # Headers of the POST to ``get_loc_url`` (see ``main``); cookie
        # placeholders are, in order: timestamp, timestamp, codeKey.
        self.get_loc_mess_h = {
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            "Cache-Control": "max-age=0",
            "Accept": "*/*",
            "Accept-Language": "zh-CN",
            "mcode": "{}",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip, deflate",
            "Content-Length": "0",
            "Host": "webapi.cninfo.com.cn",
            "Connection": "Keep-Alive",
            "Pragma": "no-cache",
            "Cookie": "UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557966,1564558754,1564559126,{}; codeKey={}",
        }
        # Headers of the company-list POSTs (see ``get_comp_name``); cookie
        # placeholders are, in order: timestamp, timestamp, codeKey.
        self.get_comp_name_h = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564622577,1564623888,1564625108,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; codeKey={}",
            "Host": "webapi.cninfo.com.cn",
            "mcode": "{}",
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # Headers of the indicator-data POSTs (see ``cut_comp_code``); cookie
        # placeholders are, in order: codeKey, timestamp, timestamp.
        self.get_data_h = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Connection": "keep-alive",
                "Content-Length": "0",
                "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; codeKey={}; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564623888,1564625108,1564625230,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
                "Host": "webapi.cninfo.com.cn",
                "mcode": "{}",
                "Origin": "http://webapi.cninfo.com.cn",
                "Referer": "http://webapi.cninfo.com.cn/",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/75.0.3770.100 Safari/537.36",
                "X-Requested-With": "XMLHttpRequest",
            }
        # Form body of the indicator-data POST; ``scode`` is set by
        # ``cut_comp_code`` and ``sdate``/``edate`` by ``parse_comp_json``.
        self.get_data_d = {
            "scode": "",
            "sdate": "",
            "edate": "",
            "type": "071001",
            "@column": "SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N,F009N,F010N,F011N,F012N"
                       ",F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N,F024N,F025N,F026N,F027N"
                       ",F028N,F029N,F030N,F031N,F032N,F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N"
                       ",F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N"
                       ",F059N,F060N,F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N"
                       ",F074N,F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N,F088N"
                       ",F089N,F090N,F091N",
        }
        # Form body of the company-list POST; both placeholders are filled
        # from the request URL in ``get_comp_name``.
        self.get_comp_name_d = {
            "platetype": "{}",
            "platecode": "{}",
            "@orderby": "SECCODE:asc",
            "@column": "SECCODE,SECNAME",
        }
        # One session is shared by every request issued by this object.
        self.session = requests.Session()
        self.util = Util()
        # Requested first in ``main``; its Set-Cookie header carries the codeKey.
        self.get_code_url = "http://webapi.cninfo.com.cn/api-cloud-platform/login/getVerfyCode"
        # Returns the category tree that ``parse_json`` turns into company-list URLs.
        self.get_loc_url = "https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1016"
        # Quarter-end dates (YYYYMMDD) for 2017-2019, used as sdate/edate.
        self.d_date = [i + j for i in ["2017", "2018", "2019"] for j in ["0331", "0630", "0930", "1231"]]

    def parse_json(self, content):
        content = self.util.get_json_obj(content)
        datas = content["records"][3]["children"]
        return ["http://webapi.cninfo.com.cn/{}?{}&@column=SECCODE,SECNAME"\
                .format(data["API"], data["PARAM"]) for data in datas]

    def parse_data(self, data):
        y = self.get_data_d["sdate"][:4]
        if self.get_data_d["sdate"][4:6] == "03":
            quarter = "第一季度"
        elif self.get_data_d["sdate"][4:6] == "06":
            quarter = "第二季度"
        elif self.get_data_d["sdate"][4:6] == "09":
            quarter = "第三季度"
        elif self.get_data_d["sdate"][4:6] == "12":
            quarter = "第四季度"
        else:
            quarter = "--"
        if isinstance(data, str):
            data = self.util.get_json_obj(data)
        for d in data["records"]:
            id_code = self.util.MD5(d["SECNAME"] + y + quarter)
            print(d["SECNAME"])
            sql = """insert into  webapi_cninfo(id,
                    SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,
                    F006N,F007N,F008N,F009N,F010N,F011N,F012N,F013N,F014N,
                    F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N,
                    F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N,
                    F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N,
                    F043N,F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N,
                    F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N,
                    F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,
                    F070N,F071N,F072N,F073N,F074N,F075N,F076N,F077N,F078N,
                    F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N,
                    F088N,F089N,F090N,F091N,y,quarter,crawl_time)
                    values
                    ('%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s',
                    '%s','%s','%s','%s','%s','%s')""" \
                  % (
                    id_code,
                    d["SECCODE"],
                    d["SECNAME"],
                    d["STARTDATE"],
                    d["ENDDATE"],
                    d["F001D"],
                    d["F002V"],
                    d["F003V"],
                    d["F006N"],
                    d["F007N"],
                    d["F008N"],
                    d["F009N"],
                    d["F010N"],
                    d["F011N"],
                    d["F012N"],
                    d["F013N"],
                    d["F014N"],
                    d["F015N"],
                    d["F016N"],
                    d["F017N"],
                    d["F018N"],
                    d["F019N"],
                    d["F020N"],
                    d["F021N"],
                    d["F022N"],
                    d["F023N"],
                    d["F024N"],
                    d["F025N"],
                    d["F026N"],
                    d["F027N"],
                    d["F028N"],
                    d["F029N"],
                    d["F030N"],
                    d["F031N"],
                    d["F032N"],
                    d["F033N"],
                    d["F034N"],
                    d["F035N"],
                    d["F036N"],
                    d["F037N"],
                    d["F038N"],
                    d["F039N"],
                    d["F040N"],
                    d["F041N"],
                    d["F043N"],
                    d["F044N"],
                    d["F045N"],
                    d["F046N"],
                    d["F047N"],
                    d["F048N"],
                    d["F049N"],
                    d["F050N"],
                    d["F051N"],
                    d["F052N"],
                    d["F053N"],
                    d["F054N"],
                    d["F055N"],
                    d["F056N"],
                    d["F057N"],
                    d["F058N"],
                    d["F059N"],
                    d["F060N"],
                    d["F061N"],
                    d["F062N"],
                    d["F063N"],
                    d["F064N"],
                    d["F065N"],
                    d["F066N"],
                    d["F067N"],
                    d["F068N"],
                    d["F069N"],
                    d["F070N"],
                    d["F071N"],
                    d["F072N"],
                    d["F073N"],
                    d["F074N"],
                    d["F075N"],
                    d["F076N"],
                    d["F077N"],
                    d["F078N"],
                    d["F079N"],
                    d["F080N"],
                    d["F081N"],
                    d["F082N"],
                    d["F083N"],
                    d["F084N"],
                    d["F085N"],
                    d["F086N"],
                    d["F087N"],
                    d["F088N"],
                    d["F089N"],
                    d["F090N"],
                    d["F091N"],
                    y,
                    quarter,
                    self.util.get_now_time()
                                        )
            self.util.insert2mysql(d["SECNAME"], sql)
            time.sleep(0.3)

    def cut_comp_code(self, scode, codekey, ts):
            # 请求数据的base_url
            data_url = "http://webapi.cninfo.com.cn/api/stock/p_stock2332?scode={}" \
                       "&sdate=20190331&edate=20190331&type=071001&" \
                       "@column=SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N," \
                       "F009N,F010N,F011N,F012N,F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N," \
                       "F022N,F023N,F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N,F033N,F034N," \
                       "F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N,F044N,F045N,F046N,F047N,F048N," \
                       "F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N,F061N," \
                       "F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N,F074N," \
                       "F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N," \
                       "F088N,F089N,F090N,F091N".format(scode)
            stamp = self.util.get_stamp()  # 统一时间戳
            # 生成新的请求headers
            self.get_data_h["Cookie"] = self.get_data_h["Cookie"].format(codekey, stamp, stamp)
            self.get_data_h["mcode"] = self.get_data_h["mcode"].format(self.util.base64_encode(ts).decode("utf-8"))
            self.get_data_d["scode"] = scode
            data = self.session.post(url=data_url, headers=self.get_data_h, data=self.get_data_d).text
            self.parse_data(data)

    # 处理公司的json数据
    def parse_comp_json(self, json_res, codekey, ts):
        content = self.util.get_json_obj(json_res)
        ls_comp_code = []
        for c in content["records"]:
            ls_comp_code.append(c["SECCODE"])  # 得到公司代码

        if len(ls_comp_code) % 20 == 0:
            loop = int(len(ls_comp_code) / 20)
        else:
            loop = int(len(ls_comp_code) / 20)
        for dd in self.d_date:
            print(dd)
            self.get_data_d["sdate"] = dd
            self.get_data_d["edate"] = dd
            s = 0
            e = 20
            for _ in range(loop):
                time.sleep(1.5)
                scode = ",".join(ls_comp_code[s:e])
                s += 20
                if e < len(ls_comp_code):
                    e += 20
                else:
                    e = len(ls_comp_code)

                self.cut_comp_code(scode, codekey, ts)
            time.sleep(30)

    # 获取所有公司名称
    def get_comp_name(self, get_loc_res, codekey, ts):
        # 获取公司名称
        for get_comp_name_url in self.parse_json(get_loc_res):
            # 处理请求参数
            self.get_comp_name_h["Cookie"] = self.get_comp_name_h["Cookie"] \
                .format(self.util.get_stamp(), self.util.get_stamp(), codekey)
            self.get_comp_name_h["mcode"] = self.get_comp_name_h["mcode"].format(self.util.base64_encode(ts))
            self.get_comp_name_d["platetype"] = self.get_comp_name_d["platetype"].format(
                re.findall(r'platetype=(\d+)&', get_comp_name_url)[0])
            self.get_comp_name_d["platecode"] = self.get_comp_name_d["platecode"].format(
                re.findall(r'platecode=(\d+)&', get_comp_name_url)[0])
            # 开始请求公司名称
            comp_name_res = self.session.post(url=get_comp_name_url,
                                              headers=self.get_comp_name_h,
                                              data=self.get_comp_name_d).text
            self.parse_comp_json(comp_name_res, codekey, ts)

    def main(self):
        # 请求网页,为得到本次会话的codekey 值
        self.get_code_key_h["Cookie"] = self.get_code_key_h["Cookie"].format(int(time.time()))  # 构造headers
        get_code_res = self.session.get(url=self.get_code_url, headers=self.get_code_key_h, verify=False)
        ts = int(time.time())  # 获取本次会话的时间戳
        codekey = re.findall(r'codeKey=(.*?);', get_code_res.headers["Set-Cookie"])[0]  # 得到codekey
        # 得到以地区分类的网页
        self.get_loc_mess_h["mcode"] = self.get_loc_mess_h["mcode"].format(self.util.base64_encode(ts))
        self.get_loc_mess_h["Cookie"] = self.get_loc_mess_h["Cookie"]\
            .format(self.util.get_stamp(), self.util.get_stamp(), codekey)
        get_loc_res = self.session.post(url=self.get_loc_url, headers=self.get_loc_mess_h).text
        # 处理获取公司名称
        self.get_comp_name(get_loc_res, codekey, ts)
Exemplo n.º 5
0
class FemhzsMofcomGov:
    def __init__(self):
        """Prepare the list-page URL template, both header sets and the DB connection."""
        accept = ("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
                  "application/signed-exchange;v=b3")
        user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/75.0.3770.100 Safari/537.36")
        host = "femhzs.mofcom.gov.cn"

        # The first ``sp={}`` receives the page number (see ``run``).
        self.base_url = ("http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList_nav.pageNoLink.html?"
                         "session=T&sp={}&sp=S+_t1.CORP_CDE%2C+_t1.id&sp=T&sp=S")

        # Headers for the paged list requests.
        self.headers = {
            "Accept": accept,
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cookie": "JSESSIONID=ACBDC30A40FD783627A075ADB9440B4D; insert_cookie=56224592  ",
            "Host": host,
            "Referer": "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": user_agent,
        }

        # Headers for the very first request, which is sent without a cookie.
        self.f_headers = {
            "Host": host,
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": user_agent,
            "Accept": accept,
            "Referer": "http://www.mofcom.gov.cn/publicService.shtml",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }

        self.util = Util()
        self.conn = self.util.MySQL()

        self.page = 0

    def insert2mysql(self, sql):
        try:
            self.conn.cursor().execute(sql)
            self.conn.commit()
            print("插入成功")
        except pymysql.err.IntegrityError:
            print("插入失败,数据重复")
            self.conn.rollback()
        except pymysql.err.ProgrammingError:
            print("数据异常,已回滚")
            self.conn.rollback()

    def run(self):
        """Crawl every page of the enterprise list and store each table row.

        The landing page is fetched first to pick up the session cookie and the
        total page count. Each list page (1 to the total, inclusive) is then
        requested, its table rows are parsed and every row is inserted through
        ``insert2mysql``. Each request is retried once after a 10 second pause.

        :raises IndexError: if the page-count element or a table cell is missing.
        :raises requests.exceptions.RequestException: if a request fails again
            on its retry.
        """
        first_url = "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html"
        # requests signals failures with its own exception classes, which are
        # not subclasses of the builtin TimeoutError, so those are caught here.
        # The request itself must sit inside the ``try`` for a retry to happen.
        try:
            first_req = requests.get(url=first_url, headers=self.f_headers, timeout=15)
        except requests.exceptions.Timeout:
            time.sleep(10)
            first_req = requests.get(url=first_url, headers=self.f_headers, timeout=15)
        except requests.exceptions.ConnectionError:
            time.sleep(10)
            # Use a different browser identity for the list-page requests.
            self.headers["User-Agent"] = random.choice(User_Agent)
            first_req = requests.get(url=first_url, headers=self.f_headers, timeout=15)

        set_cookie = first_req.headers.get("Set-Cookie")
        if set_cookie:
            # Strip the path attributes so only the name=value pairs remain.
            self.headers["Cookie"] = set_cookie.replace(
                " Path=/fecpmvc,", "").replace("; path=/", "")
        # Without a Set-Cookie header the preset cookie of self.headers is kept.

        page = etree.HTML(first_req.text).xpath(
            "//em[@class=\"m-page-total-num\"]/text()")[0]
        print("共有:{} 页".format(page))

        # The upper bound is inclusive: the last page must be fetched as well.
        for i in range(1, int(page) + 1):
            print(i)
            # A list of pairs keeps all four ``sp`` values; a dict literal with
            # repeated keys silently keeps only the last one.
            data = [
                ("session", "T"),
                ("sp", i),
                ("sp", "S _t1.CORP_CDE, _t1.id"),
                ("sp", "T"),
                ("sp", "S"),
            ]
            url = self.base_url.format(i)
            try:
                res = requests.get(url=url,
                                   headers=self.headers,
                                   data=data,
                                   timeout=15)
            except requests.exceptions.Timeout:
                time.sleep(10)
                res = requests.get(url=url,
                                   headers=self.headers,
                                   data=data,
                                   timeout=15)
            time.sleep(2)
            if res.status_code != 200:
                print("请求失败, HTTP Code:{}".format(res.status_code))
                continue

            print("请求成功,开始解析")
            html = etree.HTML(res.text)
            for tr in html.xpath("//table[@class=\"m-table\"]/tbody/tr"):
                company_name = tr.xpath("./td[1]/text()")[0].strip()
                investor_name = tr.xpath("./td[2]/text()")[0].strip()
                country = tr.xpath("./td[3]/text()")[0].strip()
                # The MD5 of the company name is used as the row id.
                md5_company = self.util.MD5(company_name)
                # Current time, stored as the insert timestamp.
                otherStyleTime = self.util.get_now_time()

                # NOTE(review): the statement is built by string interpolation
                # from scraped text, so a quote in a name breaks it; switching
                # to a parameterized execute needs insert2mysql to accept
                # parameters.
                sql = "insert into EXT_INV_ENTP_LST_INF(ID, OVS_INV_ENTP_NM, OVS_INV_NM, INV_CNR, INPT_DT)values('%s','%s','%s','%s','%s')" % (
                    md5_company, company_name, investor_name, country,
                    otherStyleTime)
                self.insert2mysql(sql)
Exemplo n.º 6
0
class Jobui:
    def __init__(self):
        """Create the helper object and define the start URL and request headers."""
        self.util = Util()
        # City-selection page from which ``parse`` starts.
        self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3"

        accept = ("text/html,application/xhtml+xml,"
                  "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
        cookie = "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1"
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"

        # Headers sent with every request of this crawler.
        self.headers = {
            "Accept": accept,
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Host": "www.jobui.com",
            "Pragma": "no-cache",
            "Referer": "https://www.jobui.com/cmp",
            "Cookie": cookie,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": user_agent,
        }

    def load(self):
        """Return the contents of ``Scrapyed.txt`` in the working directory.

        :return: the file's text, or ``""`` when the file does not exist, so
            callers can always run a membership test on the result.
        """
        try:
            with open("Scrapyed.txt", 'r', encoding="utf8") as f:
                return f.read()
        except FileNotFoundError:
            print("文件不存在!!!!")
            return ""

    # 处理数据的总方法
    def parse(self):
        """Entry point: fetch the city-selection page and crawl the listed cities.

        Only the fifth ``dd`` row of the ``j-change`` list is processed (slice
        ``[4:5]``); every link in it is requested and handed to
        ``parse_area_page``.
        """
        landing = self.util.get_req(url=self.url, headers=self.headers)
        tree = self.util.get_xpath_obj(landing.text)
        province_rows = tree.xpath("//dl[@class=\"j-change\"]/dd")[4:5]
        for row in province_rows:  # one dd row per province
            for anchor in row.xpath("./a"):  # one link per city
                # The href is protocol-relative, so the scheme is prepended.
                every_url = "https:" + anchor.xpath("./@href")[0]
                print(anchor.xpath("./text()")[0])
                print("每个城市的url: " + every_url)
                city_page = self.util.get_req(url=every_url, headers=self.headers)
                self.parse_area_page(city_page)

    # 处理地区页面
    def parse_area_page(self, response):
        """Crawl one city page by enumerating every filter combination.

        The URL of ``response`` is extended with each combination of industry,
        company type, staff size and "impression" tag, and every resulting URL
        is passed to ``parse_list``. The impression tags are read from the
        first filter row of the page, skipping its first entry; if the page
        offers none, nothing is crawled.

        :param response: response of the city page; ``response.text`` and
            ``response.request.url`` are read.
        """
        area_html = self.util.get_xpath_obj(response.text)
        tese = area_html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")

        # Each industry appears once, so no combination is requested twice.
        industries = [
            "其他行业", "贸易/进出口", "新能源", "广告", "互联网/电子商务", "教育/培训/院校",
            "电子技术/半导体/集成电路", "专业服务(咨询、人力资源、财会)", "建筑/建材/工程", "家居/室内设计/装潢",
            "房地产", "公关/市场推广/会展", "金融/投资/证券", "快速消费品(食品、饮料、化妆品)", "汽车及零配件",
            "家具/家电/玩具/礼品", "餐饮业", "外包服务", "计算机软件", "机械/设备/重工", "批发/零售",
            "中介服务", "酒店/旅游", "仪器仪表/工业自动化", "服装/纺织/皮革", "医疗/护理/卫生",
            "影视/媒体/艺术/文化传播", "制药/生物工程", "交通/运输/物流", "美容/保健", "环保",
            "原材料和加工", "通信/电信/网络设备", "石油/化工/矿产/地质", "娱乐/休闲/体育", "物业管理/商业中心",
            "印刷/包装/造纸", "农/林/牧/渔", "电气/电力/水利", "医疗设备/器械", "保险",
            "学术/科研", "采掘业/冶炼", "计算机服务(系统、数据服务、维修)", "会计/审计", "生活服务",
            "计算机硬件", "其他"
        ]
        company_types = [
            "民营公司", "国企", "合资", "上市公司", "创业公司", "外资", "事业单位", "外企代表处",
            "非营利机构", "其他性质"
        ]
        staff_sizes = [
            "50-99", "少于50", "100-499", "500-999", "1000-4999",
            "5000-9999", "10000以上"
        ]

        base_url = response.request.url
        for a in industries:
            # Each query fragment is encoded once per level, not per innermost pass.
            industry_part = "&industry={}".format(self.util.url_encode(a))
            for b in company_types:
                type_part = "&type={}".format(self.util.url_encode(b))
                for c in staff_sizes:
                    worker_part = "&worker={}".format(self.util.url_encode(c))
                    for d in tese[1:]:
                        use_url = base_url + industry_part + type_part + worker_part \
                                  + "&impression={}".format(self.util.url_encode(d))
                        print(d)
                        print(use_url)
                        self.parse_list(use_url)
                        print("-" * 150)
                        time.sleep(0.5)
                # Pauses between groups of requests (presumably to throttle the crawl).
                time.sleep(0.5)
            time.sleep(1)
        time.sleep(1.5)

    # 处理 每一个列表页的方法
    def parse_list_page(self, line):
        """Walk the result pages of one list URL and scrape every new company.

        Up to 50 pages (``&n=1`` .. ``&n=50``) are requested. For each company
        entry except the last one on a page, the id taken from its detail link
        is looked up in the text returned by ``load``; unknown companies are
        fetched and passed to ``handle_data``.

        :param line: list URL without the page parameter.
        :return: ``False`` when paging stops early (a page has no entries, or
            has 20 or fewer and is therefore taken to be the last one);
            ``None`` after all 50 pages were processed.
        """
        for i in range(1, 51):
            print("第{}页开始抓取".format(i))
            page_url = line + "&n={}".format(i)
            # Pass the page text, as every other get_xpath_obj call site does.
            rep = self.util.get_xpath_obj(
                self.util.get_req(url=page_url, headers=self.headers).text)
            companies = rep.xpath("//div[@class=\"c-company-list\"]")
            if not companies:
                print("该页无数据。。")
                return False

            for item in companies[:-1]:
                detail_url = item.xpath(
                    "./div[@class=\"company-content-box\"]/div/div[1]/a/@href"
                )
                # Check for a link before indexing into the result.
                parts = detail_url[0].split("/") if detail_url else []
                if len(parts) >= 2:
                    # load() may return None when the record file is missing.
                    scraped = self.load() or ""
                    if parts[-2] not in scraped:
                        url = "https://www.jobui.com" + detail_url[0]
                        try:
                            self.handle_data(
                                self.util.get_req(url=url,
                                                  headers=self.headers))
                        except TimeoutError:
                            print("超时了!!!")
                        except Exception:
                            # One retry after a short pause; a second failure
                            # propagates to the caller.
                            print("188 行出错了!!")
                            time.sleep(5)
                            self.handle_data(
                                self.util.get_req(url=url,
                                                  headers=self.headers))
                        time.sleep(1)
                time.sleep(0.1)

            if len(companies) <= 20:
                return False
            print("第{}页抓取完毕!!".format(i))

    # 处理排列组合好后的列表页
    def parse_list(self, line):
        """Crawl one filtered list URL, splitting it by staff size when it is large.

        The total result count is read from the page. With more than 1000
        results the URL is crawled once per staff-size filter, each time
        starting from the original ``line``; otherwise it is crawled as is.

        :param line: list URL built from a filter combination.
        :raises ValueError: if the count text is not an integer.
        """
        counts = self.util.get_xpath_obj(
            self.util.get_req(url=line, headers=self.headers).text
        ).xpath(
            "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"
        )
        # A page without the counter element is treated as "no data".
        data_count = counts[0].strip() if counts else ""
        print("数量总计: " + data_count)
        if not data_count:
            print("页面无数据!!!")
            return

        if int(data_count) > 1000:
            guimo = [
                "少于50", "50-99", "100-499", "500-999", "1000-4999",
                "5000-9999", "10000以上"
            ]
            for c in guimo:
                print(c)
                # Build each URL from the original line so the worker filters
                # do not pile up from one iteration to the next.
                sized_line = line + "&worker={}".format(self.util.url_encode(c))
                print(sized_line)
                self.parse_list_page(sized_line)
        else:
            self.parse_list_page(line)

    # 处理公司信息
    def handle_data(self, res):
        """Parse one company page, crawl its sub-pages and store the company.

        For a 200 response the company profile fields are extracted, every
        job-list page is passed to ``handle_jobs``, the financing page (when
        the last navigation tab links to one) is passed to ``handle_rz_info``,
        the company row is inserted, and the company code is appended to
        ``./Scrapyed.txt``.

        :param res: response of a company detail page; ``status_code``,
            ``text`` and ``request.url`` are read.
        :return: ``False`` when the status is not 200, otherwise ``None``.
        :raises IndexError: when the job counter or the last navigation tab
            is missing from the page.
        """
        print("-" * 100)
        page_url = res.request.url
        print(page_url)
        if res.status_code != 200:
            print(res.status_code)
            return False

        response = self.util.get_xpath_obj(res.text)
        info_items = response.xpath(
            "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]")
        # Profile fields stay blank unless the page has exactly three info
        # items, the only layout the extraction rules below are written for.
        title = brief_intro = xingzhi = guimo = hangye = quancheng = intro = ""
        if len(info_items) == 3:
            title = response.xpath("//h1/a/text()")[0].strip().replace(
                "\u2022", "")
            banner = response.xpath(
                "//div[@class=\"company-banner-segmetation\"]/p/text()")
            if banner:
                brief_intro = banner[0].strip()
            xingzhi = "".join(
                response.xpath(
                    "//div[@class=\"company-nature\"]/text()")).strip()
            guimo = "".join(
                response.xpath(
                    "//div[@class=\"company-worker\"]/text()")).strip()
            industry_names = response.xpath(
                "//div[@class=\"company-info-item\"][2]/span/a/text()")
            hangye = ";".join(name.strip() for name in industry_names).strip()
            name_parts = response.xpath(
                "//div[@class=\"company-info-item\"][3]/text()")
            quancheng = "".join(
                part for part in name_parts if len(part.strip()) > 1).strip()
            intro = "".join(
                response.xpath("//*[@id=\"textShowMore\"]/text()")).strip()

        id_code = self.util.MD5(quancheng)
        comp_code = page_url.split("/")[-2]
        crawl_time = self.util.get_now_time()

        job_info = response.xpath(
            "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()"
        )[0].strip()
        # "///" is the placeholder the page shows instead of a job count.
        if job_info == "///":
            job_count = 0
        else:
            job_count = int(job_info.replace("个", "").strip())
        if job_count > 0:
            # Job lists are requested 15 postings per page; round up.
            full_pages, leftover = divmod(job_count, 15)
            last_page = full_pages + (1 if leftover else 0)
            for page_no in range(1, last_page + 1):
                job_url = page_url + "jobs/p{}/".format(page_no)
                self.handle_jobs(
                    self.util.get_req(url=job_url, headers=self.headers))
                time.sleep(0.1)

        # href of the last navigation tab (no domain); it is the financing
        # page only when it contains "financing".
        rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[0]
        rongzi = ""
        if "financing" in rz:
            rongzi = response.xpath(
                "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0]
            self.handle_rz_info(
                self.util.get_req(url="https://www.jobui.com" + rz,
                                  headers=self.headers))
            time.sleep(0.1)

        row = (
            id_code,
            title,
            brief_intro,
            xingzhi,
            guimo,
            hangye,
            rongzi,
            quancheng,
            pymysql.escape_string(intro),
            job_count,
            comp_code,
            crawl_time,
        )
        self.util.insert2mysql("(企业信息)" + title, self.sql_info(row))
        # Record the company code as done in the local progress file.
        with open("./Scrapyed.txt", 'a', encoding="utf8") as done_file:
            done_file.write(comp_code + "\n")

    # 处理招聘信息
    def handle_jobs(self, res):
        """Extract every job posting from one job-list page and store it.

        Each posting yields its name, location and a " | "-separated tag
        string that is sorted into education, experience, employment type and
        salary. One row per posting is written through
        ``self.util.insert2mysql``.

        A posting whose required fields are missing is logged and skipped:
        re-reading the same parsed page can never make it succeed. A failing
        insert is retried a bounded number of times with a 10 second pause.

        :param res: response of a ``.../jobs/p<N>/`` page; ``text`` and
            ``request.url`` are read.
        :return: None.
        """
        print(res.request.url)
        response = self.util.get_xpath_obj(res.text)
        # The company code is the fourth-from-last "/"-separated URL piece.
        comp_code = str.split(res.request.url, "/")[-4]
        xueli_levels = ("初中以上", "中专以上", "高中以上", "大专以上", "本科以上",
                        "硕士以上", "应届毕业生")
        money_marks = ("万", "元", "K", "-", "k", "~")
        max_attempts = 3

        for item_node in response.xpath(
                "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]"
        ):
            try:
                job_name = item_node.xpath("./div[1]/a/h3/text()")[0]
                job_location = item_node.xpath(
                    "./div[2]/div/span[1]/text()")[0]
                tags = item_node.xpath(
                    "./div[2]/div/span[2]/text()")[0].split(" | ")
            except IndexError as e:
                print(e)
                continue

            job_xueli = ""
            job_year = ""
            job_xingzhi = ""
            job_money = ""
            for p in tags:
                if "在读" in p:
                    job_xueli = p
                if p in xueli_levels:
                    job_xueli = p
                    continue
                if "年" in p:
                    job_year = p
                    continue
                if p in ("全职", "实习"):
                    job_xingzhi = p
                    continue
                if any(m in p for m in money_marks):
                    job_money = p

            crawl_time = self.util.get_now_time()
            id_code = self.util.MD5(comp_code + job_name + job_location)
            t_job = (id_code, job_name, job_location, job_xueli, job_year,
                     job_xingzhi, job_money, comp_code, crawl_time)
            for attempt in range(1, max_attempts + 1):
                try:
                    self.util.insert2mysql(job_name, self.sql_job(t_job))
                    break
                except Exception as e:
                    print(e)
                    if attempt < max_attempts:
                        time.sleep(10)

    # 处理融资信息
    def handle_rz_info(self, res):
        """Extract the financing rounds of a company page and store them.

        Every ``c-finace-list`` entry contributes one row: the headline is
        read as "<stage>,<amount>" and the description as
        "<date>,<investor>,<investor>...". Investors are stored joined by
        ";".

        A headline without a comma is kept whole as the stage with an empty
        amount, and a description without investors keeps its date; neither
        case raises.

        :param res: response of the financing page; ``text`` and
            ``request.url`` are read.
        :return: None.
        :raises IndexError: when entries exist but the company title
            (``h1#companyH1``) is missing.
        """
        print("+" * 100)
        print(res.request.url)
        response = self.util.get_xpath_obj(res.text)
        # The company code is the third-from-last "/"-separated URL piece.
        comp_code = str.split(res.request.url, "/")[-3]
        for rz_item in response.xpath(
                "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]"
        ):
            headline = rz_item.xpath("./div/div/h3/text()")
            if headline:
                # partition() never fails on a missing or repeated comma,
                # unlike unpacking the result of split() into two names.
                rz_stage, _, money = headline[0].partition(",")
                rz_money = money.strip()
            else:
                rz_stage = rz_money = ""

            desc = rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()")
            if desc:
                # The first piece is the date; all remaining pieces are investors.
                rz_edate, _, investors = desc[0].partition(",")
                rz_compy = ";".join(
                    name.strip() for name in investors.split(",")
                    if name.strip())
            else:
                rz_edate = rz_compy = ""

            id_code = self.util.MD5(
                response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] +
                rz_stage)
            crawl_time = self.util.get_now_time()
            t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code,
                    crawl_time)
            self.util.insert2mysql(rz_stage, self.sql_rz(t_rz))

    def sql_info(self, tuple):
        """Render the INSERT statement for one ``tmp_jobui_info_n`` row.

        :param tuple: the 12 column values in column order (id, title,
            brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro,
            job_count, comp_code, crawl_time).
        :return: the SQL text with each value placed in its quoted slot.
        """
        statement = """
                    insert into tmp_jobui_info_n(id, title, brief_intro, 
                                        xingzhi, guimo, hangye, 
                                        rongzi, quancheng, 
                                        intro, job_count, comp_code, crawl_time) 
                                        values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                    """
        return statement % tuple

    def sql_job(self, tuple):
        """Render the INSERT statement for one ``tmp_jobui_job_n`` row.

        :param tuple: the 9 column values in column order (id, job_name,
            job_location, job_xueli, job_year, job_xingzhi, job_money,
            comp_code, crawl_time).
        :return: the SQL text with each value placed in its quoted slot.
        """
        statement = """
                        insert into tmp_jobui_job_n(id, job_name, job_location, 
                                            job_xueli, job_year, 
                                            job_xingzhi, job_money, comp_code, crawl_time) 
                                            values('%s','%s','%s','%s','%s','%s','%s','%s','%s') 
                    """
        return statement % tuple

    def sql_rz(self, tuple):
        """Render the INSERT statement for one ``tmp_jobui_rz`` row.

        :param tuple: the 7 column values in column order (id, rz_stage,
            rz_money, rz_edate, rz_compy, comp_code, crawl_time).
        :return: the SQL text with each value placed in its quoted slot.
        """
        statement = """
                    insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, 
                                            rz_compy, comp_code, crawl_time) 
                                            values('%s','%s','%s','%s','%s','%s','%s') 
            """
        return statement % tuple
Exemplo n.º 7
0
class ZdbPedaily:
    """Crawler for the enterprise listing of zdb.pedaily.cn.

    ``run`` walks the paginated list URLs in ``self.urls``. Every company
    link found on a list page is fetched by ``get_detail_info``, which writes
    one ``INV_EVT_ENTP_BAS_INF`` row per financing event shown on the company
    page and then stores the company's key people and shareholders.
    """

    # Cookie sent with every request. The single ``{}`` is the slot of the
    # ``Hm_lpvt_*`` entry and is filled with ``self.util.get_stamp()``. The
    # template is kept unfilled here so the value can be refreshed repeatedly.
    _COOKIE_TEMPLATE = (
        "__uid=1452122016; "
        "__utmc=23980325; "
        "ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; "
        "BAIDU_SSP_lcr=https://www.baidu.com/link?url=LHrB83UJlUcy6-MhfY_1I-IRwU723Vl0YUkuCsVJ5MlEYZUAvU2Mv5jTfYQ2ZC0u&wd=&eqid=b0d97bf1000ba11a000000065d3018e2; "
        "Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1563415171,1563433191,1563523111; "
        "__utma=23980325.1444638820.1563415171.1563433192.1563523112.3; "
        "__utmz=23980325.1563523112.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; "
        "__fromtype=1; "
        "accesstoken=PQZUMOXSH2; "
        "Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; "
        "__utmb=23980325.10.10.1563523112"
    )

    # Candidate locations of the company introduction, tried in this order;
    # the first one that yields text wins.
    _INTRO_QUERIES = (
        '//div[@class="box-fix-l"]/p/text()',
        '//div[@class="box-fix-l"]/p/span/text()',
        '//div[@class="box-fix-l"]/pre/text()',
        '//div[@class="box-fix-l"]/div/div/text()',
        '//div[@class="box-fix-l"]/div/text()',
        '//div[@id="cke_pastebin"]//text()',
        '//div[@class="box-fix-l"]/ul/text()',
    )

    # INSERT for one company/financing-event row; 24 quoted value slots.
    _INSERT_ENTERPRISE_SQL = """insert into INV_EVT_ENTP_BAS_INF(
                                                                    ID
                                                                    ,CMP_NM
                                                                    ,ORG_TOT_DEPT
                                                                    ,REG_PLC_PNT
                                                                    ,CMP_SET_UP_TM
                                                                    ,AFL_IDT
                                                                    ,FORMAL_WEB
                                                                    ,CMP_INTRO
                                                                    ,LVRG_TTL
                                                                    ,LVRG_INF
                                                                    ,LVRG_INTRO
                                                                    ,LGP_RPRS
                                                                    ,SET_UP_TM
                                                                    ,REG_CPT
                                                                    ,OPR_RIT
                                                                    ,REG_NBR
                                                                    ,APRV_TM
                                                                    ,ORG_ORG_CD_NBR
                                                                    ,SOC_CRD_CD
                                                                    ,TAX_PSN_RCG_NBR
                                                                    ,REG_INSTT
                                                                    ,ENTP_TYP
                                                                    ,CTC_MTH
                                                                    ,INPT_DT

                                                        )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                                                        """

    def __init__(self):
        """Prepare the list-page URLs, the shared ``Util`` and the headers."""
        self.urls = [
            "https://zdb.pedaily.cn/enterprise/p{}/".format(i)
            for i in range(1, 770)
        ]
        self.util = Util()

        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie":
            self._COOKIE_TEMPLATE,
            "Host":
            "zdb.pedaily.cn",
            "Referer":
            "https://zdb.pedaily.cn/",
            # Header names may not contain spaces.
            "Upgrade-Insecure-Requests":
            "1",
        }

    def _refresh_cookie(self):
        """Fill the Cookie header's timestamp slot with a fresh stamp."""
        self.headers["Cookie"] = self._COOKIE_TEMPLATE.format(
            self.util.get_stamp())

    @staticmethod
    def _first_or_blank(node, query):
        """Return the first XPath match of ``query`` on ``node``, or ""."""
        found = node.xpath(query)
        return found[0] if found else ""

    def get_shareholder(self, id_code, detail_html):
        """Store every shareholder row of a company page.

        :param id_code: identifier of the company the rows belong to.
        :param detail_html: parsed company detail page.
        :raises IndexError: if a table row lacks its first or second cell text.
        """
        esc = pymysql.escape_string
        for si in detail_html.xpath(
                "//table[@class=\"shareholder-info\"]/tbody/tr"):
            shareholder_name = si.xpath("./td[1]/text()")[0]
            shareholder_type = si.xpath("./td[2]/text()")[0]
            # The contribution amount is optional.
            shareholder_money = self._first_or_blank(si, "./td[3]/text()")
            crawl_time = self.util.get_now_time()
            # Scraped text is escaped so a quote cannot break the statement.
            sql_sharholder = "insert into INV_EVT_ENTP_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \
                             "values('%s', '%s', '%s', '%s','%s')" \
                             % (id_code, esc(shareholder_name), esc(shareholder_type),
                                esc(shareholder_money), crawl_time)

            self.util.insert2mysql("股东信息", sql_sharholder)

    def get_main_people(self, id_code, detail_html):
        """Store every key person (name and position) of a company page.

        :param id_code: identifier of the company the rows belong to.
        :param detail_html: parsed company detail page.
        :raises IndexError: if an entry lacks its name or position text.
        """
        esc = pymysql.escape_string
        for p in detail_html.xpath("//div[@class=\"business-people\"]/ul/li"):
            mp_name = p.xpath("./h3/text()")[0]
            mp_position = p.xpath("./p/text()")[0]

            crawl_time = self.util.get_now_time()

            sql_main_people = "insert into INV_EVT_ENTP_MAIN_PSN_INF(ID,MAIN_PPL_NM,MAIN_PPL_POS,INPT_DT) " \
                              "values('%s', '%s', '%s','%s')" % (
                                  id_code, esc(mp_name), esc(mp_position), crawl_time)
            self.util.insert2mysql("主要人物", sql_main_people)

    def get_detail_info(self, detail_url):
        """Fetch one company page and store everything found on it.

        Nothing is stored unless the page answers with status 200. One
        ``INV_EVT_ENTP_BAS_INF`` row is written per financing event listed on
        the page (so a company without events gets no such row); key people
        and shareholders are stored afterwards in either case.

        :param detail_url: absolute URL of the company detail page.
        :raises IndexError: if a mandatory element (title, one of the four
            basic list items, a base-info cell, an event link) is missing.
        """
        detail_res = self.util.get_req(url=detail_url, headers=self.headers)
        print(detail_res.status_code)
        if detail_res.status_code != 200:
            return

        detail_html = self.util.get_xpath_obj(detail_res)
        # Basic information of the detail page.
        company_name = detail_html.xpath("//h1/text()")[0]
        (company_base, company_reg_loc, company_bound_date,
         company_industry) = [
             detail_html.xpath(
                 "//div[@class=\"box-fix-l\"]/div/ul/li[%d]/text()" % n)[0]
             for n in range(1, 5)
         ]
        company_site = self._first_or_blank(
            detail_html,
            "//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()")

        company_intro = ""
        for query in self._INTRO_QUERIES:
            matches = detail_html.xpath(query)
            if matches:
                company_intro = matches[0]
                break

        # Registration details exist only when the "business" section does.
        if detail_html.xpath("//div[@id=\"business\"]"):

            def cell(row, col):
                """Text of one base-info table cell (1-based row/column)."""
                return detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[%d]/td[%d]/text()" %
                    (row, col))[0]

            legal_person = cell(1, 2)
            founded_time = cell(1, 4)
            registered_capital = cell(2, 2)
            operational_authority = cell(2, 4)
            registered_num = cell(3, 2)
            approval_date = cell(3, 4)
            organizational_code = cell(4, 2)
            creditfcode = cell(4, 4)
            identification_number = cell(5, 2)
            registration_authority = cell(5, 4)
            enterprise_type = cell(6, 2)
        else:
            legal_person = founded_time = registered_capital = ""
            operational_authority = registered_num = approval_date = ""
            organizational_code = creditfcode = ""
            identification_number = registration_authority = ""
            enterprise_type = ""

        id_code = self.util.MD5(company_name + creditfcode)
        if detail_html.xpath("//*[@id=\"contact\"]"):
            contact = "".join(
                detail_html.xpath("//*[@id=\"contact\"]/p//text()")).replace(
                    "'", "").strip()
        else:
            contact = ""

        esc = pymysql.escape_string
        # Financing events: each one opens on its own page.
        for event in detail_html.xpath("//div[@class=\"list-invest\"]/ul/li"):
            href = event.xpath("./div[@class=\"view\"]/a/@href")[0]
            if href.startswith("http"):
                rz_url = href
            else:
                rz_url = "https://zdb.pedaily.cn" + href
            print(rz_url)
            self._refresh_cookie()
            rz_res = self.util.get_req(url=rz_url, headers=self.headers)
            if rz_res.status_code == 200:
                print("融资事件详情页请求成功")
                rz_page = self.util.get_xpath_obj(rz_res.text)
                rz_title = rz_page.xpath("//h1/text()")[0]
                rz_info = "".join(
                    rz_page.xpath("//div[@class=\"info\"]/ul/li//text()"))
                rz_intro = self._first_or_blank(
                    rz_page, "//div[@id=\"desc\"]/p/text()")
            else:
                rz_title = rz_info = rz_intro = ""

            crawl_time = self.util.get_now_time().replace("'", "")
            text_fields = (
                company_name, company_base, company_reg_loc,
                company_bound_date, company_industry, company_site,
                company_intro, rz_title, rz_info, rz_intro, legal_person,
                founded_time, registered_capital, operational_authority,
                registered_num, approval_date, organizational_code,
                creditfcode, identification_number, registration_authority,
                enterprise_type, contact)
            # All scraped text is escaped, not only the industry field, so
            # quotes in names or introductions cannot break the statement.
            sql_qyk = self._INSERT_ENTERPRISE_SQL % (
                (id_code, ) + tuple(esc(field) for field in text_fields) +
                (crawl_time, ))
            self.util.insert2mysql("融资公司信息", sql_qyk)

        self.get_main_people(id_code, detail_html)
        self.get_shareholder(id_code, detail_html)

    def get_items_list(self, res):
        """Visit the detail page of every company linked on one list page.

        :param res: response of a list page, handed to
            ``self.util.get_xpath_obj`` as is.
        """
        html = self.util.get_xpath_obj(res)
        for li in html.xpath("//ul[@id=\"enterprise-list\"]/li"):
            time.sleep(2)
            links = li.xpath("./div[1]/a/@href")
            if not links:
                continue
            detail_url = "https://zdb.pedaily.cn" + links[0]
            print(detail_url)
            self.get_detail_info(detail_url)

    def run(self):
        """Crawl every list page in ``self.urls`` in order."""
        self._refresh_cookie()
        for url in self.urls:
            print("列表页:" + url + "开始爬取")
            res = self.util.get_req(url=url, headers=self.headers)
            self.get_items_list(res)
Exemplo n.º 8
0
class JobuiProcess(object):
    def __init__(self):
        self.util = Util()
        self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3"
        self.headers = {
                            "Accept": "text/html,application/xhtml+xml,"
                            "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                            "Accept-Encoding": "gzip, deflate, br",
                            "Accept-Language": "zh-CN,zh;q=0.9",
                            "Cache-Control": "no-cache",
                            "Connection": "keep-alive",
                            "Host": "www.jobui.com",
                            "Pragma": "no-cache",
                            "Referer": "https://www.jobui.com/cmp",
                            "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1",
                            "Upgrade-Insecure-Requests": "1",
                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
        self.sleep_time = 0.3

        # 多进程初始化队列
        self.url_queue = JoinableQueue()
        self.resp_queue = JoinableQueue()
        self.item_queue = JoinableQueue()

        # mongo config
        self.mongo_host = "mongodb://*****:*****@class=\"j-change\"]/dd")[-1:]:  # 遍历多行dd(省份)
            for area in dd.xpath("./a")[-1:]:  # 遍历行内区域(市级)
                every_url = "https:" + area.xpath("./@href")[0]  # 按照城市列表分别请求和处理
                print(area.xpath("./text()")[0])
                # print("每个城市的url: " + every_url)
                self.parse_area_page(self.util.get_req(url=every_url, headers=self.headers))

    # 处理地区页面
    def parse_area_page(self, response):
        """Split one city's company listing into crawlable slices.

        Every industry/type pair read from the area page is queried. A slice
        reporting 1000 or more results is narrowed further, first by company
        size (``worker``) and then by the tags of the last filter row
        (``impression``). Each slice small enough is handed to
        ``parse_list_page``. A slice that still reports more than 1000
        results after all three filters is only reported on stdout.

        Each narrowed URL is built from its parent slice's URL, so a URL
        never carries more than one ``worker`` or ``impression`` parameter.

        :param response: response of the area page; ``text`` and
            ``request.url`` are read.
        :return: None.
        """
        area_html = self.util.get_xpath_obj(response.text)
        hangye = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")
        xingzhi = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()")
        guimo = ["少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上"]
        count_query = "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"

        def load(url):
            """Fetch ``url``; return (parsed page, count text or None)."""
            page = self.util.get_xpath_obj(
                self.util.get_req(url=url, headers=self.headers).text)
            texts = page.xpath(count_query)
            # The count is the second text node of the counter element.
            return page, (texts[1].strip() if texts else None)

        # The first option of each filter row is skipped.
        for a in hangye[1:]:
            for b in xingzhi[1:]:
                base_url = response.request.url \
                    + "&industry={}".format(self.util.url_encode(a)) \
                    + "&type={}".format(self.util.url_encode(b))
                _, data_count1 = load(base_url)
                if data_count1 is None:
                    continue
                print("{}-{} 共有:{} 条数据".format(a, b, data_count1))
                if int(data_count1) < 1000:
                    self.parse_list_page(base_url)
                    continue

                for c in guimo:
                    worker_url = base_url + "&worker={}".format(self.util.url_encode(c))
                    print(worker_url)
                    worker_page, data_count2 = load(worker_url)
                    if data_count2 is None:
                        continue
                    print("{}-{}-{} 共有:{} 条数据".format(a, b, c, data_count2))
                    if int(data_count2) < 1000:
                        self.parse_list_page(worker_url)
                        continue

                    tese = worker_page.xpath(
                        "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()")
                    for d in tese[1:]:
                        tag_url = worker_url + "&impression={}".format(self.util.url_encode(d))
                        _, data_count3 = load(tag_url)
                        if data_count3 is None:
                            self.parse_list_page(tag_url)
                        elif int(data_count3) > 1000:
                            print("排列组合后数据大于一千, 具体数量: " + data_count3)
                        else:
                            print("{}-{}-{}-{} 共有:{} 条数据".format(a, b, c, d, data_count3))
                            self.parse_list_page(tag_url)

    # 处理 每一个列表页的方法
    def parse_list_page(self, line):
        """Queue the company detail URLs of one listing, page by page.

        Up to 50 result pages (``&n=1`` .. ``&n=50``) are requested. Every
        company link found is put on ``self.url_queue`` with the jobui domain
        prepended. Entries without a link are skipped.

        :param line: listing URL that already carries its filter parameters.
        :return: ``False`` as soon as a page has no entries or at most 20 of
            them; ``None`` once all 50 pages have been read.
        """
        for i in range(1, 51):
            print("第{}页开始抓取".format(i))
            page_url = line + "&n={}".format(i)
            rep = self.util.get_xpath_obj(self.util.get_req(url=page_url, headers=self.headers))
            companies = rep.xpath("//div[@class=\"c-company-list\"]")
            if not companies:
                return False
            # The last matched node is left out - presumably it is not a
            # real company entry; TODO confirm against the page markup.
            for item in companies[:-1]:
                detail_url = item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@href")
                if not detail_url:
                    continue
                self.url_queue.put("https://www.jobui.com" + detail_url[0])
            # 20 nodes or fewer: treated as the final page of the listing.
            if len(companies) <= 20:
                return False

    # 处理公司信息
    def handle_data(self):
        """Consume company URLs from ``self.url_queue`` forever and scrape each one.

        For every URL the company page is fetched and parsed into a fresh item
        dict, the job-listing pages and (when linked) the financing page are
        handed to ``handle_jobs`` / ``handle_rz_info``, the item is put on
        ``self.item_queue`` and the company code is appended to
        ``./Scrapyed.txt``.

        A non-200 response is put back on the queue and is not parsed.  Any
        exception raised while fetching or parsing propagates to the caller
        and therefore ends this worker loop.
        """
        print("*" * 100)

        while True:
            time.sleep(self.sleep_time)
            url = self.url_queue.get()
            response = self.util.get_req(url=url, headers=self.headers)
            if response.status_code != 200:
                # Re-queue for another attempt and skip parsing the error page.
                # NOTE(review): a URL that never answers 200 is retried without limit.
                self.url_queue.put(response.url)
                self.url_queue.task_done()
                continue

            res_html = self.util.get_xpath_obj(response.text)
            # A new dict per company: the consumer of item_queue must never
            # see a dict that a later iteration is still mutating.
            item = {}
            if len(res_html.xpath(
                    "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]")) == 3:
                # Other counts of info items have not been confirmed to occur.
                item["title"] = res_html.xpath("//h1/a/text()")[0].strip().replace("\u2022", "")
                brief = res_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()")
                item["brief_intro"] = brief[0].strip() if brief else ""
                item["xingzhi"] = "".join(
                    res_html.xpath("//div[@class=\"company-nature\"]/text()")).strip()
                item["guimo"] = "".join(
                    res_html.xpath("//div[@class=\"company-worker\"]/text()")).strip()
                item["hangye"] = ";".join(
                    i.strip()
                    for i in res_html.xpath("//div[@class=\"company-info-item\"][2]/span/a/text()")
                ).strip()
                item["quancheng"] = "".join(
                    i for i in res_html.xpath("//div[@class=\"company-info-item\"][3]/text()")
                    if len(i.strip()) > 1
                ).strip().replace("...", "")
                item["intro"] = "".join(res_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip()
            else:
                for key in ("title", "brief_intro", "xingzhi", "guimo",
                            "hangye", "quancheng", "intro"):
                    item[key] = ""

            item["id_code"] = self.util.MD5(item["quancheng"])
            # The company code is the second path segment from the end of the URL.
            item["comp_code"] = str.split(response.request.url, "/")[-2]
            item["crawl_time"] = self.util.get_now_time()

            job_info = res_html.xpath(
                "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()")[0].strip()
            # "///" is shown instead of a number when there are no postings.
            job_count = 0 if job_info == "///" else int(job_info.replace("个", "").strip())
            item["job_count"] = job_count
            if job_count > 0:
                # Listing pages are walked in steps of 15 postings: ceil(job_count / 15).
                last_page = (job_count + 14) // 15
                for i in range(1, last_page + 1):
                    job_url = response.request.url + "jobs/p{}/".format(i)
                    self.handle_jobs(self.util.get_req(url=job_url, headers=self.headers))
                    time.sleep(0.1)

            # Relative address (no domain) of the last navigation tab.
            rz = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[0]
            if "financing" in rz:
                item["rongzi"] = res_html.xpath(
                    "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0]
                self.handle_rz_info(
                    self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers))
                time.sleep(0.1)
            else:
                item["rongzi"] = ""

            self.item_queue.put(item)
            with open("./Scrapyed.txt", 'a', encoding="utf8") as f:
                f.write(item["comp_code"] + "\n")
            self.url_queue.task_done()  # one unit of queued work finished

    def insert2mongoDB(self, item):
        """Insert ``item`` as one document into MongoDB.

        The client connects to ``self.mongo_host``; ``self.mongo_client`` is
        used as the database name and ``self.mongo_db`` as the collection
        name.  The client is opened for this single insert and always closed
        again, so repeated calls do not accumulate open connections.

        Args:
            item: the document (dict) to insert.
        """
        myclient = pymongo.MongoClient(self.mongo_host)
        try:
            collection = myclient[self.mongo_client][self.mongo_db]
            collection.insert_one(item)
        finally:
            myclient.close()

    def save_item(self):
        """Forever take items off ``self.item_queue`` and store each in MongoDB.

        Every item is passed to ``insert2mongoDB`` and then acknowledged with
        ``task_done``.  The loop only ends when one of those calls raises.
        """
        queue = self.item_queue
        while True:
            record = queue.get()
            self.insert2mongoDB(record)
            queue.task_done()

    # 处理招聘信息
    def handle_jobs(self, res):
        """Parse one job-listing page of a company and store every posting.

        Args:
            res: HTTP response of a job-listing page; its ``text`` is parsed
                and ``request.url`` supplies the company code.

        Each posting becomes one row that is formatted by ``self.sql_job`` and
        written with ``self.util.insert2mysql``.  A posting without a title or
        a location is skipped, because re-reading the already parsed document
        cannot repair it.  A failing insert is retried a few times with a
        10 second pause and then given up, so a single bad row can neither
        stall the crawler forever nor re-insert the rows that preceded it.
        """
        education_levels = ("初中以上", "中专以上", "高中以上", "大专以上",
                            "本科以上", "硕士以上", "应届毕业生")
        salary_markers = ("万", "元", "K", "-", "k", "~")
        max_insert_attempts = 3

        response = self.util.get_xpath_obj(res.text)
        # The company code is the fourth path segment from the end of the URL.
        comp_code = str.split(res.request.url, "/")[-4]
        for item_node in response.xpath(
                "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]"):
            names = item_node.xpath("./div[1]/a/h3/text()")
            locations = item_node.xpath("./div[2]/div/span[1]/text()")
            if not names or not locations:
                print("job posting without title or location skipped")
                continue
            job_name = names[0]
            job_location = locations[0]
            crawl_time = self.util.get_now_time()

            # The second span holds " | "-separated tags (education, years,
            # employment type, salary) in no fixed order; it may be absent.
            tag_text = item_node.xpath("./div[2]/div/span[2]/text()")
            tags = tag_text[0].split(" | ") if tag_text else []
            job_xueli = job_year = job_xingzhi = job_money = ""
            for p in tags:
                if "在读" in p:
                    job_xueli = p
                if p in education_levels:
                    job_xueli = p
                    continue
                if "年" in p:
                    job_year = p
                    continue
                if p in ("全职", "实习"):
                    job_xingzhi = p
                    continue
                if any(m in p for m in salary_markers):
                    job_money = p

            id_code = self.util.MD5(comp_code + job_name + job_location)
            t_job = (id_code, job_name, job_location, job_xueli, job_year,
                     job_xingzhi, job_money, comp_code, crawl_time)
            sql = self.sql_job(t_job)
            for attempt in range(1, max_insert_attempts + 1):
                try:
                    self.util.insert2mysql(job_name, sql)
                    break
                except Exception as e:
                    # Only this row is retried; earlier rows are already stored.
                    print(e)
                    if attempt < max_insert_attempts:
                        time.sleep(10)

    # 处理融资信息
    def handle_rz_info(self, res):
        """Parse a company's financing page and store one row per funding round.

        Args:
            res: HTTP response of the financing page; its ``text`` is parsed
                and ``request.url`` supplies the company code.

        The heading of a round is read as ``<stage>,<amount>`` and its
        description as ``<date>,<investor>[,...]``.  Missing parts become
        empty strings instead of raising, and a description without an
        investor part still keeps its date.  Each row is formatted by
        ``self.sql_rz`` and written with ``self.util.insert2mysql``.

        Raises:
            IndexError: if the page has funding rounds but no company name
                heading (the name is part of the row id).
        """
        print("+" * 100)
        response = self.util.get_xpath_obj(res.text)
        # The company code is the third path segment from the end of the URL.
        comp_code = str.split(res.request.url, "/")[-3]
        # Queried once; only indexed when there is a round to store.
        company_names = response.xpath("//h1[@id=\"companyH1\"]/a/text()")
        for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]"):
            headings = rz_item.xpath("./div/div/h3/text()")
            if headings:
                # partition() copes with zero or several commas, where
                # two-name unpacking of split() raised ValueError.
                rz_stage, _, money = headings[0].partition(",")
                rz_money = money.strip()
            else:
                rz_stage = rz_money = ""

            descriptions = rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()")
            if descriptions:
                # First element is the date, the remainder are investors.
                rz_edate, *people = str.split(descriptions[0], ",")
                # NOTE(review): only the first investor is stored, as before;
                # confirm whether all of ``people`` should be joined with ";".
                rz_compy = people[0].strip() if people else ""
            else:
                rz_edate = rz_compy = ""

            id_code = self.util.MD5(company_names[0] + rz_stage)
            crawl_time = self.util.get_now_time()
            t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time)
            self.util.insert2mysql(rz_stage, self.sql_rz(t_rz))

    def run(self):
        """Build the URL queue, start the company workers and wait for the queues.

        ``self.parse`` is run 100 times, each time in its own daemon process
        that is joined before the next one starts.  Afterwards five daemon
        processes running ``self.handle_data`` are started, and the caller
        blocks until ``self.url_queue`` and ``self.resp_queue`` are joined.
        """
        # Build the URL list: one producer at a time, each awaited in turn.
        for _ in range(100):
            producer = Process(target=self.parse)
            producer.daemon = True
            producer.start()
            producer.join()

        # Send requests and process responses.
        workers = [Process(target=self.handle_data) for _ in range(5)]
        for worker in workers:
            worker.daemon = True  # daemon workers end together with the parent
            worker.start()

        # Block until every queued unit has been marked done.
        pending = (self.url_queue, self.resp_queue)
        for queue in pending:
            queue.join()

    def sql_info(self, tuple):
        """Return the INSERT statement for one ``tmp_jobui_info_n`` row.

        Args:
            tuple: the 12 column values in column order.

        NOTE(review): values are interpolated as-is between single quotes;
        callers are responsible for escaping them.
        """
        template = """
                    insert into tmp_jobui_info_n(id, title, brief_intro, 
                                        xingzhi, guimo, hangye, 
                                        rongzi, quancheng, 
                                        intro, job_count, comp_code, crawl_time) 
                                        values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                    """
        return template % tuple

    def sql_job(self, tuple):
        """Return the INSERT statement for one ``tmp_jobui_job_n`` row.

        Args:
            tuple: the 9 column values in column order.

        NOTE(review): values are interpolated as-is between single quotes;
        callers are responsible for escaping them.
        """
        template = """
                        insert into tmp_jobui_job_n(id, job_name, job_location, 
                                            job_xueli, job_year, 
                                            job_xingzhi, job_money, comp_code, crawl_time) 
                                            values('%s','%s','%s','%s','%s','%s','%s','%s','%s') 
                    """
        return template % tuple

    def sql_rz(self, tuple):
        """Return the INSERT statement for one ``tmp_jobui_rz`` row.

        Args:
            tuple: the 7 column values in column order.

        NOTE(review): values are interpolated as-is between single quotes;
        callers are responsible for escaping them.
        """
        template = """
                    insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, 
                                            rz_compy, comp_code, crawl_time) 
                                            values('%s','%s','%s','%s','%s','%s','%s') 
            """
        return template % tuple
Exemplo n.º 9
0
class Jobui:
    def __init__(self):
        """Create the HTTP helper and set the entry URL, request headers and counters."""
        self.util = Util()
        # Entry page of the crawl: the "change city" page.
        self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3"
        # Headers sent with every request made by this crawler.
        request_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Host": "www.jobui.com",
            "Pragma": "no-cache",
            "Referer": "https://www.jobui.com/cmp",
            "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }
        self.headers = request_headers
        self.sleep_time = 0.1  # seconds to pause after a filter-page request
        self.data_num = 0  # company entries counted for the current area

    def load(self):
        """Return the full text of ``Scrapyed.txt``.

        Returns:
            The file content as one string, or an empty string when the file
            does not exist yet.  Always returning a string keeps membership
            tests on the result from failing on ``None``.
        """
        try:
            with open("Scrapyed.txt", 'r', encoding="utf8") as f:
                return f.read()
        except FileNotFoundError:
            print("文件不存在!!!!")
            return ""

    # 处理数据的总方法
    def parse(self):
        """Crawl the area behind the first link of the 11th ``dd`` on the entry page.

        The entry page (``self.url``) is fetched, the link's address is
        prefixed with ``https:``, ``self.data_num`` is reset, the link text is
        printed, the area page is fetched and handed to ``parse_area_page``,
        and finally the number of counted companies is printed.
        """
        entry_response = self.util.get_req(url=self.url, headers=self.headers)
        entry_doc = self.util.get_xpath_obj(entry_response.text)

        # One fixed row of the area list is crawled per run.
        area_href = entry_doc.xpath("//dl[@class=\"j-change\"]/dd[11]/a[1]/@href")[0]
        every_url = "https:" + area_href
        self.data_num = 0

        area_name = entry_doc.xpath("//dl[@class=\"j-change\"]/dd[11]/a[1]//text()")[0]
        print(area_name)

        area_response = self.util.get_req(url=every_url, headers=self.headers)
        self.parse_area_page(area_response)
        print("此地区共抓取公司数量为:" + str(self.data_num))

    # 处理地区页面
    def parse_area_page(self, response):
        """Split an area's company search into filter combinations and crawl each.

        Args:
            response: HTTP response of the area page; its ``text`` supplies
                the industry and company-type filter values and
                ``request.url`` is the base URL the filters are appended to.

        For every industry/type pair the result count is read from the page.
        Below 1000 results the listing is crawled directly with
        ``parse_list_page``.  From 1000 results on, the search is narrowed by
        company size, and if that is still 1000 or more, by the values of the
        last filter row (``impression``).  A combination that still reports
        more than 1000 results is only printed, not crawled.  Each narrowed
        URL is built from its parent URL, so exactly one ``worker`` and one
        ``impression`` parameter is ever present.
        """
        count_xpath = "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"
        encode = self.util.url_encode

        area_html = self.util.get_xpath_obj(response.text)
        hangye = area_html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")
        xingzhi = area_html.xpath(
            "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()")
        guimo = [
            "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999",
            "10000以上"
        ]
        base_url = response.request.url

        # The first entry of each filter row is skipped.
        for a in hangye[1:]:
            for b in xingzhi[1:]:
                type_url = base_url + "&industry={}".format(encode(a)) \
                                    + "&type={}".format(encode(b))
                r = self.util.get_req(url=type_url, headers=self.headers)
                time.sleep(self.sleep_time)
                counts = self.util.get_xpath_obj(r.text).xpath(count_xpath)
                if not counts:
                    continue
                data_count1 = counts[1].strip()
                print("{}-{} 共有:{} 条数据".format(a, b, data_count1))
                if int(data_count1) < 1000:
                    self.parse_list_page(type_url)
                    continue

                for c in guimo:
                    worker_url = type_url + "&worker={}".format(encode(c))
                    print(worker_url)
                    r = self.util.get_req(url=worker_url, headers=self.headers)
                    time.sleep(self.sleep_time)
                    worker_html = self.util.get_xpath_obj(r.text)
                    counts = worker_html.xpath(count_xpath)
                    if not counts:
                        continue
                    data_count2 = counts[1].strip()
                    print("{}-{}-{} 共有:{} 条数据".format(a, b, c, data_count2))
                    if int(data_count2) < 1000:
                        self.parse_list_page(worker_url)
                        continue

                    tese = worker_html.xpath(
                        "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()")
                    for d in tese[1:]:
                        impression_url = worker_url + "&impression={}".format(encode(d))
                        r = self.util.get_req(url=impression_url, headers=self.headers)
                        time.sleep(self.sleep_time)
                        counts = self.util.get_xpath_obj(r.text).xpath(count_xpath)
                        if counts:
                            data_count3 = counts[1].strip()
                            if int(data_count3) > 1000:
                                # No further filter to narrow with: report only.
                                print("排列组合后数据大于一千, 具体数量: " + data_count3)
                                continue
                            print("{}-{}-{}-{} 共有:{} 条数据".format(a, b, c, d, data_count3))
                        self.parse_list_page(impression_url)

    # 处理 每一个列表页的方法
    def parse_list_page(self, line):
        """Walk up to 50 result pages of one company search and scrape new companies.

        Args:
            line: search URL without the page parameter; ``&n=<page>`` is
                appended for every page.

        Every list entry except the last one of a page is counted in
        ``self.data_num``.  Entries whose company code is not yet recorded in
        ``Scrapyed.txt`` are fetched and passed to ``handle_data``; entries
        without a detail link are skipped.

        Returns:
            False when paging stops early (a page without entries, or a page
            with at most 20 entries); None after the 50th page.
        """
        for i in range(1, 51):
            print("第{}页开始抓取".format(i))
            page_url = line + "&n={}".format(i)
            page_response = self.util.get_req(url=page_url, headers=self.headers)
            # The parser is given the page text, as everywhere else in this class.
            rep = self.util.get_xpath_obj(page_response.text)
            companies = rep.xpath("//div[@class=\"c-company-list\"]")
            if not companies:
                print("该页无数据。。")
                return False

            # Codes recorded so far, one per line.  Comparing whole codes
            # avoids "12" matching inside "123"; ``or ""`` tolerates a
            # missing file.
            scraped = set((self.load() or "").split())
            # The extraction rule below is unchanged since 2019-12-16; the
            # last entry of a page is deliberately left out.
            for item in companies[:-1]:
                detail_url = item.xpath(
                    "./div[@class=\"company-content-box\"]/div/div[1]/a/@href")
                self.data_num += 1
                if not detail_url:
                    continue
                comp_code = str.split(detail_url[0], "/")[-2]
                if comp_code not in scraped:
                    url = "https://www.jobui.com" + detail_url[0]
                    try:
                        self.handle_data(
                            self.util.get_req(url=url, headers=self.headers))
                    except TimeoutError:
                        print("超时了!!!")
                    except Exception:
                        # One more attempt after a short pause; a second
                        # failure propagates to the caller.
                        print("188 行出错了!!")
                        time.sleep(5)
                        self.handle_data(
                            self.util.get_req(url=url, headers=self.headers))
                    time.sleep(1)
                time.sleep(0.1)

            if len(companies) <= 20:
                return False
            print("第{}页抓取完毕!!".format(i))

    # 处理公司信息
    def handle_data(self, res):
        """Parse one company page, crawl its job and financing pages, store the company.

        Args:
            res: HTTP response of a company detail page.

        The company fields are read from the page (all empty when the intro
        block does not hold exactly three info items), every job-listing page
        is passed to ``handle_jobs``, the financing page (when linked) to
        ``handle_rz_info``, and the company row is written with
        ``self.util.insert2mysql``.  Finally the company code is appended to
        ``./Scrapyed.txt``.  Every scraped text field is escaped before it is
        placed into the SQL statement, so a quote in a title or description
        cannot break the statement.

        Returns:
            False when the response status is not 200, otherwise None.
        """
        if res.status_code != 200:
            print(res.status_code)
            return False

        response = self.util.get_xpath_obj(res.text)
        info_items = response.xpath(
            "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]")
        if len(info_items) == 3:
            # Other counts of info items have not been confirmed to occur.
            title = response.xpath("//h1/a/text()")[0].strip().replace("\u2022", "")
            brief = response.xpath(
                "//div[@class=\"company-banner-segmetation\"]/p/text()")
            brief_intro = brief[0].strip() if brief else ""
            xingzhi = "".join(
                response.xpath("//div[@class=\"company-nature\"]/text()")).strip()
            guimo = "".join(
                response.xpath("//div[@class=\"company-worker\"]/text()")).strip()
            hangye = ";".join(
                i.strip() for i in response.xpath(
                    "//div[@class=\"company-info-item\"][2]/span/a/text()")
            ).strip()
            quancheng = "".join(
                i for i in response.xpath(
                    "//div[@class=\"company-info-item\"][3]/text()")
                if len(i.strip()) > 1
            ).strip().replace("...", "")
            intro = "".join(
                response.xpath("//*[@id=\"textShowMore\"]/text()")).strip()
        else:
            title = brief_intro = xingzhi = guimo = hangye = quancheng = intro = ""

        id_code = self.util.MD5(quancheng)
        # The company code is the second path segment from the end of the URL.
        comp_code = str.split(res.request.url, "/")[-2]
        crawl_time = self.util.get_now_time()

        job_info = response.xpath(
            "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()"
        )[0].strip()
        # "///" is shown instead of a number when there are no postings.
        job_count = 0 if job_info == "///" else int(job_info.replace("个", "").strip())
        if job_count > 0:
            # Listing pages are walked in steps of 15 postings: ceil(job_count / 15).
            last_page = (job_count + 14) // 15
            for i in range(1, last_page + 1):
                job_url = res.request.url + "jobs/p{}/".format(i)
                self.handle_jobs(
                    self.util.get_req(url=job_url, headers=self.headers))
                time.sleep(0.1)

        # Relative address (no domain) of the last navigation tab.
        rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[0]
        if "financing" in rz:
            rongzi = response.xpath(
                "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0]
            self.handle_rz_info(
                self.util.get_req(url="https://www.jobui.com" + rz,
                                  headers=self.headers))
            time.sleep(0.1)
        else:
            rongzi = ""

        escape = pymysql.escape_string
        t = (
            id_code,
            escape(title),
            escape(brief_intro),
            escape(xingzhi),
            escape(guimo),
            escape(hangye),
            escape(rongzi),
            escape(quancheng),
            escape(intro),
            job_count,
            comp_code,
            crawl_time,
        )
        self.util.insert2mysql("(企业信息)" + title, self.sql_info(t))
        with open("./Scrapyed.txt", 'a', encoding="utf8") as f:
            f.write(comp_code + "\n")

    # 处理招聘信息
    def handle_jobs(self, res):
        """Parse one job-listing page of a company and store every posting.

        Args:
            res: HTTP response of a job-listing page; its ``text`` is parsed
                and ``request.url`` supplies the company code.

        Each posting becomes one row that is formatted by ``self.sql_job`` and
        written with ``self.util.insert2mysql``.  A posting without a title or
        a location is skipped, because re-reading the already parsed document
        cannot repair it.  A failing insert is retried a few times with a
        10 second pause and then given up, so a single bad row can neither
        stall the crawler forever nor re-insert the rows that preceded it.
        """
        education_levels = ("初中以上", "中专以上", "高中以上", "大专以上",
                            "本科以上", "硕士以上", "应届毕业生")
        salary_markers = ("万", "元", "K", "-", "k", "~")
        max_insert_attempts = 3

        response = self.util.get_xpath_obj(res.text)
        # The company code is the fourth path segment from the end of the URL.
        comp_code = str.split(res.request.url, "/")[-4]
        for item_node in response.xpath(
                "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]"
        ):
            names = item_node.xpath("./div[1]/a/h3/text()")
            locations = item_node.xpath("./div[2]/div/span[1]/text()")
            if not names or not locations:
                print("job posting without title or location skipped")
                continue
            job_name = names[0]
            job_location = locations[0]
            crawl_time = self.util.get_now_time()

            # The second span holds " | "-separated tags (education, years,
            # employment type, salary) in no fixed order; it may be absent.
            tag_text = item_node.xpath("./div[2]/div/span[2]/text()")
            tags = tag_text[0].split(" | ") if tag_text else []
            job_xueli = job_year = job_xingzhi = job_money = ""
            for p in tags:
                if "在读" in p:
                    job_xueli = p
                if p in education_levels:
                    job_xueli = p
                    continue
                if "年" in p:
                    job_year = p
                    continue
                if p in ("全职", "实习"):
                    job_xingzhi = p
                    continue
                if any(m in p for m in salary_markers):
                    job_money = p

            id_code = self.util.MD5(comp_code + job_name + job_location)
            t_job = (id_code, job_name, job_location, job_xueli, job_year,
                     job_xingzhi, job_money, comp_code, crawl_time)
            sql = self.sql_job(t_job)
            for attempt in range(1, max_insert_attempts + 1):
                try:
                    self.util.insert2mysql(job_name, sql)
                    break
                except Exception as e:
                    # Only this row is retried; earlier rows are already stored.
                    print(e)
                    if attempt < max_insert_attempts:
                        time.sleep(10)

    # 处理融资信息
    def handle_rz_info(self, res):
        """Parse a company's financing page and store one row per funding round.

        Args:
            res: HTTP response of the financing page; its ``text`` is parsed
                and ``request.url`` supplies the company code.

        The heading of a round is read as ``<stage>,<amount>`` and its
        description as ``<date>,<investor>[,...]``.  Missing parts become
        empty strings instead of raising, and a description without an
        investor part still keeps its date.  Each row is formatted by
        ``self.sql_rz`` and written with ``self.util.insert2mysql``.

        Raises:
            IndexError: if the page has funding rounds but no company name
                heading (the name is part of the row id).
        """
        print("+" * 100)
        response = self.util.get_xpath_obj(res.text)
        # The company code is the third path segment from the end of the URL.
        comp_code = str.split(res.request.url, "/")[-3]
        # Queried once; only indexed when there is a round to store.
        company_names = response.xpath("//h1[@id=\"companyH1\"]/a/text()")
        for rz_item in response.xpath(
                "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]"
        ):
            headings = rz_item.xpath("./div/div/h3/text()")
            if headings:
                # partition() copes with zero or several commas, where
                # two-name unpacking of split() raised ValueError.
                rz_stage, _, money = headings[0].partition(",")
                rz_money = money.strip()
            else:
                rz_stage = rz_money = ""

            descriptions = rz_item.xpath(
                "./div/div/p[@class=\"finace-desc\"]/text()")
            if descriptions:
                # First element is the date, the remainder are investors.
                rz_edate, *people = str.split(descriptions[0], ",")
                # NOTE(review): only the first investor is stored, as before;
                # confirm whether all of ``people`` should be joined with ";".
                rz_compy = people[0].strip() if people else ""
            else:
                rz_edate = rz_compy = ""

            id_code = self.util.MD5(company_names[0] + rz_stage)
            crawl_time = self.util.get_now_time()
            t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code,
                    crawl_time)
            self.util.insert2mysql(rz_stage, self.sql_rz(t_rz))

    def sql_info(self, tuple):
        """Return the INSERT statement for one ``tmp_jobui_info_n`` row.

        Args:
            tuple: the 12 column values in column order.

        NOTE(review): values are interpolated as-is between single quotes;
        callers are responsible for escaping them.
        """
        template = """
                    insert into tmp_jobui_info_n(id, title, brief_intro, 
                                        xingzhi, guimo, hangye, 
                                        rongzi, quancheng, 
                                        intro, job_count, comp_code, crawl_time) 
                                        values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                    """
        return template % tuple

    def sql_job(self, tuple):
        """Return the INSERT statement for one ``tmp_jobui_job_n`` row.

        Args:
            tuple: the 9 column values in column order.

        NOTE(review): values are interpolated as-is between single quotes;
        callers are responsible for escaping them.
        """
        template = """
                        insert into tmp_jobui_job_n(id, job_name, job_location, 
                                            job_xueli, job_year, 
                                            job_xingzhi, job_money, comp_code, crawl_time) 
                                            values('%s','%s','%s','%s','%s','%s','%s','%s','%s') 
                    """
        return template % tuple

    def sql_rz(self, tuple):
        """Return the INSERT statement for one ``tmp_jobui_rz`` row.

        Args:
            tuple: the 7 column values in column order.

        NOTE(review): values are interpolated as-is between single quotes;
        callers are responsible for escaping them.
        """
        template = """
                    insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, 
                                            rz_compy, comp_code, crawl_time) 
                                            values('%s','%s','%s','%s','%s','%s','%s') 
            """
        return template % tuple
Exemplo n.º 10
0
class WzzxbsMofocom:
    def __init__(self):
        """Set the endpoints, request headers, form payloads and helper objects."""
        # Values shared by the list request and the detail request.
        host = "wzzxbs.mofcom.gov.cn"
        cookie = "insert_cookie=32151754"
        browser = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"

        # Record list endpoint and the URL template of one record's detail page.
        self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action"
        self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}"

        # Headers for the list request.
        self.headers = {
            "Accept": "application/json, text/javascript, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Content-Length": "169",
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": cookie,
            "Host": host,
            "Origin": "http://wzzxbs.mofcom.gov.cn",
            "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord",
            "User-Agent": browser,
            "X-Requested-With": "XMLHttpRequest",
        }

        # Headers for the detail page request.
        self.detail_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": cookie,
            "Host": host,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": browser,
        }

        # Form payload of the list request; empty fields are left for the caller.
        self.data = {
            "params.entpName": "",
            "page.currentPage": "",
            "page.limit": "2000",
            "page.option": "next",
            "page.start": "",
            "page.rowCount": "",
            "listGrid.col": "1:showRecordInfo(0),2,3,4",
            "listGrid.type": "link,ro,ro,ro",
        }
        # Parameters of the detail request.
        self.detail_data = {"params.recordId": "", "time": ""}

        self.util = Util()
        self.user_agent = UserAgent()

    def parse_18(self, detail_html, business_type):
        """Extract one filing record from a detail page and store it.

        The fields are read from table rows 3-16 inside ``div.Table1``; stray
        characters such as ``\\xe5`` and ``\\ue07e`` are removed from the cell
        texts.  The collected columns are turned into SQL by ``self.get_sql``
        and written with ``self.util.insert2mysql``.

        Args:
            detail_html: parsed detail page offering ``xpath``.
            business_type: text stored with the record; it is concatenated
                with other strings, so it must be a ``str``.

        Returns:
            ``(md5_id, item_number)`` where ``md5_id`` is the unhashed
            concatenation of company name, business type, filing date and
            filing number.

        Raises:
            IndexError: if one of the cells read without a fallback (rows 3-5,
                7, 8, 10, 11, 13, 15) is missing.
        """
        # I. Filing status
        item_content = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[3]/td/text()")[0].replace(
                "\xe5", "")  # changed items
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace(
                "\xe5", "")  # filing completion time
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[5]/td/text()")[0].replace(
                "\xe5", "")  # filing number

        # II. Basic information of the foreign-invested enterprise
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[7]/td/text()")[0].replace(
                "\ue07e", "").replace("\xe5", "")  # company name
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()")[0].replace(
                '\u3bbe', '').replace('\ue07e',
                                      '').replace("\xe5",
                                                  "").replace("\ue096",
                                                              "")  # registered address
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[9]/td/text()"
            )[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            crit_code = ""
        # The enterprise type is the label that follows the checked checkbox
        # in the serialized cell markup.
        comp_type = re.findall(
            r'checked="checked"/> (.*?)&#13;',
            str(
                etree.tostring(detail_html.xpath(
                    "//div[@class=\"Table1\"]/table[1]/tr[10]/td")[0],
                               encoding='utf-8'),
                'utf-8').strip().replace("\xe5", ""), re.S)[0]  # enterprise type
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()")[0].strip(
            ).replace("\xe5", "")  # operating period
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()"
            )[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0].replace(
                "\xe5", "").replace("\xe5", "")  # business scope
        # For the next three cells only the text before the first space is kept.
        try:
            total_investment = \
            str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0], " ")[0].replace(
                "\xa0", "").replace("\xe5", "").replace("\ue07e", "")
        except IndexError:
            total_investment = ""
        registered_capital = str.split(
            detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5",
                                                "").replace("\ue07e",
                                                            "")  # registered capital
        try:
            legal_representative = \
            str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[16]/td/text()")[0], " ")[0].replace(
                "\xa0", "").replace("\xe5", "").replace("\ue07e", "").replace("\u4b72", " ")  # legal representative
        except IndexError:
            legal_representative = ""
        # Unhashed key material; its MD5 is stored as one of the columns below.
        md5_id = comp_name + business_type + item_date + item_number
        # Only legal_representative is SQL-escaped here.
        cols = (self.util.MD5(item_number), business_type, item_content,
                item_date, item_number, comp_name, regi_addr, crit_code,
                comp_type, operating_period, investment_industry,
                business_scope, total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        s = self.get_sql(cols)
        self.util.insert2mysql(comp_name, s)
        return md5_id, item_number

    def parse_17(self, detail_html, business_type):
        """Parse a detail page whose main table has the 17-row layout and store it.

        Reads the record fields from rows 3-15 of the first table inside
        ``div.Table1``, builds the INSERT statement with ``self.get_sql`` and
        hands it to ``self.util.insert2mysql``.

        Args:
            detail_html: parsed detail page; only its ``xpath`` method is used.
            business_type: business type text taken from the list page.

        Returns:
            Tuple ``(md5_id, item_number)`` where ``md5_id`` is the raw
            concatenation of company name, business type, filing date and
            filing number (its MD5 is stored as ``cust_id``).

        Raises:
            IndexError: when a mandatory cell has no text node, or when no
                checked enterprise-type option is found.
        """
        cell_xpath = '//div[@class="Table1"]/table[1]/tr[{}]/td/text()'

        def first_text(row):
            # First text node of the cell(s) in table row `row`;
            # raises IndexError when the row has no text.
            return detail_html.xpath(cell_xpath.format(row))[0]

        def drop(text, junk):
            # Remove every character listed in `junk` from `text`.
            for ch in junk:
                text = text.replace(ch, "")
            return text

        item_content = ""  # change item (always empty for this layout)
        item_date = drop(first_text(3), "\xe5")  # filing completion time
        item_number = drop(first_text(4), "\xe5")  # filing number

        # II. Basic information of the foreign-invested enterprise
        # The company-name lookup is the only one that does not pin table[1].
        comp_name = drop(
            detail_html.xpath(
                '//div[@class="Table1"]/table/tr[6]/td/text()')[0],
            "\ue07e\xe5")  # company name
        regi_addr = drop(first_text(7),
                         "\u3bbe\ue07e\xe5\ue096")  # registered address
        try:
            crit_code = drop(first_text(8), "\xe5")  # unified social credit code
        except IndexError:
            crit_code = ""

        # Enterprise type: the cell holds several <input> options; take the
        # label that follows the one marked checked="checked".
        type_cell = detail_html.xpath(
            '//div[@class="Table1"]/table[1]/tr[9]/td')[0]
        type_markup = str(etree.tostring(type_cell, encoding='utf-8'),
                          'utf-8').strip().replace("\xe5", "")
        comp_type = re.findall(r'checked="checked"/> (.*?)&#13;',
                               type_markup, re.S)[0]

        operating_period = drop(first_text(10).strip(),
                                "\xe5")  # operating period
        try:
            investment_industry = drop(first_text(11),
                                       "\xe5")  # investment industry
        except IndexError:
            investment_industry = ""
        business_scope = drop(first_text(12), "\xe5")  # business scope
        try:
            # Keep only the part before the first space.
            total_investment = drop(first_text(13).split(" ")[0],
                                    "\xa0\xe5")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = drop(first_text(14).split(" ")[0],
                                  "\xa0\xe5")  # registered capital
        try:
            legal_representative = drop(first_text(15).split(" ")[0],
                                        "\xa0\xd6\xe5")  # legal representative
        except IndexError:
            legal_representative = ""

        # The identifiers are derived from the raw (unescaped) values.
        md5_id = comp_name + business_type + item_date + item_number

        # get_sql() places every value between single quotes verbatim, so all
        # scraped text columns are escaped here, not just the representative.
        escape = pymysql.escape_string
        text_columns = (business_type, item_content, item_date, item_number,
                        comp_name, regi_addr, crit_code, comp_type,
                        operating_period, investment_industry,
                        business_scope, total_investment, registered_capital,
                        legal_representative)
        cols = ((self.util.MD5(item_number),)
                + tuple(escape(value) for value in text_columns)
                + (self.util.MD5(md5_id), self.util.get_now_time()))
        self.util.insert2mysql(comp_name, self.get_sql(cols))
        return md5_id, item_number

    def get_sql(self, col_tuple):
        """Render the INSERT statement for one ``wzzxbs_mofcom_info`` row.

        Args:
            col_tuple: the 17 column values, in the order the columns are
                listed in the statement. Each value is %-interpolated between
                single quotes exactly as given; no escaping is done here.

        Returns:
            The SQL text of the INSERT statement.
        """
        statement_template = """
                            insert into wzzxbs_mofcom_info(
                            id,
                            business_type,
                            item_content,
                            item_date,
                            item_number,
                            comp_name,
                            regi_addr,
                            crit_code,
                            comp_type,
                            operating_period,
                            investment_industry,
                            business_scope,
                            total_investment,
                            registered_capital,
                            legal_representative,
                            cust_id,
                            craw_time
                            )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                        """
        return statement_template % col_tuple

    def parse_invesment_info(self, detail_html, md5_id, n):
        """Store every investor row of the nested investor table.

        Selects the rows of the table nested in row ``n`` of the ``div.Table1``
        table, skips the first row, and issues one INSERT into
        ``wzzxbs_mofcom_investment_info`` per remaining row through
        ``self.util.insert2mysql``.

        Args:
            detail_html: parsed detail page; only its ``xpath`` method is used.
            md5_id: raw record key returned by the record parser; its MD5 is
                stored as ``cust_id``.
            n: index of the outer table row that contains the investor table.

        Raises:
            IndexError: when one of the three cells of a row has no text node.
        """
        escape = pymysql.escape_string
        investor_rows = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[{}]/table/tr".format(n))
        for mes in investor_rows[1:]:
            # Investor name: text before the first space, stray chars removed.
            raw_name = mes.xpath("./td[1]/text()")[0]
            name_of_investor = raw_name.split(" ")[0]\
                .replace("\ue07e", "")\
                .replace("\xe5", "")\
                .replace("\xd6", "")
            different_countries = mes.xpath("./td[2]/text()")[0].replace(
                "\xe5", "")
            # Invested amount: text before the first space, stray chars removed.
            raw_amount = mes.xpath("./td[3]/text()")[0]
            amount_invested = raw_amount.split(" ")[0]\
                .replace("\xa0", "")\
                .replace("\xd6", "")\
                .replace("\xe5", "")\
                .replace("\ue07e", "")
            # The row id is hashed from the raw values; every scraped text
            # value is escaped because it lands inside a quoted SQL literal.
            investment_sql = """
                insert into wzzxbs_mofcom_investment_info(
                id,
                name_of_investor,
                different_countries,
                amount_invested,
                cust_id,
                craw_time
                )values('%s', '%s', '%s', '%s', '%s', '%s')
            """ % (self.util.MD5(name_of_investor + different_countries +
                                 amount_invested),
                   escape(name_of_investor),
                   escape(different_countries),
                   escape(amount_invested),
                   self.util.MD5(md5_id),
                   self.util.get_now_time())
            self.util.insert2mysql("投资信息|", investment_sql)

    def parse(self, num):
        """Crawl one list page and store every record it references.

        Updates ``self.data`` with the page number (and the start offset,
        2000 rows per page, when ``num`` is truthy), POSTs the list request,
        then fetches and stores each record's detail page, pausing 1.5 s
        between records.

        Any failure in the page-level work restarts the whole page: a
        ``ChunkedEncodingError`` retries immediately, any other exception
        retries after a 300 s pause. The method only returns once the page
        has been processed.

        NOTE(review): no ``timeout`` is passed to ``requests``; presumably a
        stalled connection can block here indefinitely - confirm and consider
        adding one.

        Args:
            num: page number to crawl.
        """
        self.data["page.currentPage"] = str(num)
        if num:
            self.data["page.start"] = str((int(num) - 1) * 2000)
        while True:
            try:
                page_req = requests.post(url=self.url,
                                         headers=self.headers,
                                         data=self.data)
                try:
                    items = self.util.get_json_obj(page_req.text)["rows"]
                finally:
                    # Close the response even when the body is not valid JSON.
                    page_req.close()

                for item in items:
                    business_type = item["data"][1]
                    item_code = re.findall(r'showRecordInfo\(\"(.*?)\"\)',
                                           item["data"][0])[0]
                    detail_url = self.detail_base_url.format(
                        item_code, self.util.get_stamp())  # detail page URL
                    print(detail_url)
                    self.detail_data["params.recordId"] = item_code
                    self.detail_data["time"] = self.util.get_stamp()
                    self._fetch_and_store_detail(detail_url, business_type)
                    time.sleep(1.5)
                break
            except requests.exceptions.ChunkedEncodingError as e2:
                print("e2" + str(e2))
            except Exception as e3:
                print("e3" + str(e3))
                print("=====>远程关闭连接,休息等待中。。。")
                time.sleep(300)

    def _fetch_and_store_detail(self, detail_url, business_type):
        """Fetch one detail page, retrying until it loads, then store it.

        The page is parsed with ``parse_18`` when its main table has exactly
        18 rows and with ``parse_17`` otherwise; the investor rows are then
        stored with ``parse_invesment_info``. A parsing or storing error is
        printed and the record is skipped; a fetch error is retried
        (immediately for ``ChunkedEncodingError``, after 300 s otherwise).
        """
        while True:
            try:
                detail_req = requests.get(
                    url=detail_url,
                    headers=self.detail_headers,
                    data=self.detail_data)  # detail page request
                try:
                    detail_html = self.util.get_xpath_obj(detail_req.text)
                finally:
                    # Close the response even when parsing the body fails.
                    detail_req.close()

                row_count = len(
                    detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr"))
                if row_count == 18:
                    parser, layout = self.parse_18, 18
                else:
                    parser, layout = self.parse_17, 17
                try:
                    md5_id, _ = parser(detail_html, business_type)
                    # III. Basic information of the enterprise's investors
                    self.parse_invesment_info(detail_html, md5_id, layout)
                except Exception as parse_err:
                    print("e{}".format(layout) + str(parse_err))
                    print("问题在此处被捕获了")
                return
            except requests.exceptions.ChunkedEncodingError as e:
                print("e" + str(e))
            except Exception as e1:
                print("e1" + str(e1))
                print("==>远程关闭连接,休息等待中。。。")
                time.sleep(300)

    def main(self, start_page=29):
        """Run the crawl: read the total from the first response, then walk pages.

        Sends the initial list request, copies the reported ``rowCount`` into
        ``self.data["page.rowCount"]`` and calls ``self.parse`` for every page
        number from ``start_page`` up to (not including) ``rowCount``, pausing
        30 s between pages.

        NOTE(review): the loop bound is the reported ``rowCount`` while
        ``parse`` computes offsets at 2000 rows per page - confirm whether
        ``rowCount`` is really a page count.

        Args:
            start_page: first page number to crawl. Defaults to 29, the value
                that used to be hard-coded here.
        """
        req = requests.post(url=self.url, headers=self.headers,
                            data=self.data)  # initial data request
        try:
            res_json = self.util.get_json_obj(req.text)
        finally:
            # Close the response even when the body is not valid JSON.
            req.close()
        self.data["page.rowCount"] = res_json["rowCount"]
        for i in range(start_page, int(res_json["rowCount"])):
            print("#####{}#####".format(i))
            self.parse(i)
            time.sleep(30)