Example #1
class ShunqiSpider:
    def __init__(self):
        self.start_url = 'https://b2b.11467.com/'
        self.headers = b"""Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Cookie: Hm_lvt_819e30d55b0d1cf6f2c4563aa3c36208=1616553403,1617870200; Hm_lpvt_819e30d55b0d1cf6f2c4563aa3c36208=1617870504; arp_scroll_position=400
Host: b2b.11467.com
Pragma: no-cache
Referer: https://www.11467.com/
sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"
sec-ch-ua-mobile: ?0
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-site
Sec-Fetch-User: ?1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"""
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "shunqiwang")
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.area_name_list = []

    def Get_res(self, url, headers):
        # GET the URL and return an lxml object that supports .xpath()
        # html = requests.get(url=url, headers=headers_raw_to_dict(headers))
        html = self.f.fetch(url=url, headers=headers_raw_to_dict(headers))
        res = etree.HTML(html.text)
        return res

    def get_area(self):
        res = self.Get_res(url=self.start_url, headers=self.headers)

        area_list = res.xpath(
            '//div[@class="box sidesubcat t5"]//div[@class="boxtitle"]//following-sibling::div[@class="boxcontent"]//dl[@class="listtxt"]//dd/a/@href'
        )
        area_name_list = res.xpath(
            '//div[@class="box sidesubcat t5"]//div[@class="boxtitle"]//following-sibling::div[@class="boxcontent"]//dl[@class="listtxt"]//dd/a/text()'
        )
        #"//www.11467.com/shenzhen/"
        #https://www.11467.com/shenzhen/

        for i in range(len(area_list)):
            real_url = "https:" + area_list[i]
            area_name = area_name_list[i]
            self.r2.save_category_url(area_name, real_url)
            self.area_name_list.append(area_name)

    def get_sec_category(self):
        for i in self.area_name_list:
            url = self.r2.get_category_url(i)
            res = self.Get_res(url=url, headers=self.headers)
            sec_url_list = res.xpath(
                '//div[@id="il"]//div[@class="box huangyecity t5"]//div[@class="boxcontent"]//ul//li//dl//dt//a/@href'
            )
            for url in sec_url_list:
                self.r2.save_page_url(i, url)
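
Get_res above hands the raw header block to headers_raw_to_dict. w3lib ships a function of that name (w3lib.http.headers_raw_to_dict), but it returns bytes keys mapped to lists of values; a minimal stand-in that yields the plain {str: str} mapping requests expects would be (a sketch, not the helper these examples actually import):

def headers_raw_to_dict(headers_raw: bytes) -> dict:
    # Split a raw "Name: value" block into a {str: str} header mapping.
    headers = {}
    for line in headers_raw.decode('utf-8').splitlines():
        name, sep, value = line.partition(':')
        if sep:
            headers[name.strip()] = value.strip()
    return headers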
Example #2
 def __init__(self):
     self.start_url = 'http://www.98pz.com/t59c11s1/1.html'
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
     }
     self.r0 = Redisclient(0)
     self.r1 = Redisclient(1)
     self.f = FETCH()
     self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "98guakao_hz_qz")
Example #3
 def __init__(self):
     self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
     }
     self.f = FETCH()
     self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
     self.r0 = Redisclient(0)
     self.r1 = Redisclient(1)
     self.r2 = Redisclient(2)
     self.r3 = Redisclient(3)
     self.category_name_list = []
     self.sec_category_dict = {}
     self.headers_forpage = {
         "Host": "www.80guakao.com",
         "Connection": "keep-alive",
         "Pragma": "no-cache",
         "Cache-Control": "no-cache",
         "User-Agent":
         "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 86.0.4240.111Safari / 537.36",
         "Accept": "*/*",
         "Referer": "http://www.80guakao.com/shengfen/hb/",
         "Accept-Encoding": "gzip,deflate",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cookie": "",
     }
Example #4
    def __init__(self, start_url, cookie, referer, companyCity,
                 companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {

            # ":authority":"www.cbi360.net",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding":
            "gzip,deflate,br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "no-cache",
            "Content-Type":
            "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie":
            cookie,
            # "Cookie": "",
            "pragma":
            "no-cache",
            "sec-fetch-dest":
            "document",
            "sec-fetch-mode":
            "navigate",
            "sec-fetch-site":
            "same-origin",
            "sec-fetch-user":
            "******",
            "upgrade-insecure-requests":
            "1",
            "Referer":
            referer,
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')
Example #5
 def __init__(self):
     self.starturl = 'http://hangzhou.qd8.com.cn/'
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}
     self.s = FETCH()
     self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao")
     self.r0 = Redisclient(0)
     self.r1 = Redisclient(1)
     self.r2 = Redisclient(2)
     self.r3 = Redisclient(3)
     self.item_dict = {}
     self.db = MongoDB('mongodb://localhost', 'cuiworkdb', 'kd8')
Example #6
    def __init__(self):
        self.start_url = 'https://b2b.11467.com/'
        self.headers = b"""Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Cookie: Hm_lvt_819e30d55b0d1cf6f2c4563aa3c36208=1616553403,1617870200; Hm_lpvt_819e30d55b0d1cf6f2c4563aa3c36208=1617870504; arp_scroll_position=400
Host: b2b.11467.com
Pragma: no-cache
Referer: https://www.11467.com/
sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"
sec-ch-ua-mobile: ?0
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-site
Sec-Fetch-User: ?1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"""
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "shunqiwang")
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.area_name_list = []
Example #7
 def __init__(self):
     self.statrurl = 'https://www.logoids.com/tags/diqu/1/'
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}
     self.data_demo = {'_id': '', 'category_name': '', 'brand_name': '', 'logo_url': '', }
     self.m = Mongoclient()
     # self.f=FETCH()
     self.r0 = Redisclient(0)
     self.r1 = Redisclient(1)
     self.r2 = Redisclient(2)
     self.category_list = []
Example #8
 def __init__(self):
     # start URL
     self.starturl = 'http://chengdu.atobo.com/'
     # data record template
     self.data_demo = {
         '_id': '',
         'category_name': '',
         'company_name': '',
         'company_phone': '',
         'company_address': ''
     }
     # request wrapper with a built-in proxy IP pool
     self.f = FETCH()
     self.m = MongoDB('mongodb://localhost', 'cuiworkdb', 'BMD_atb_chengdu')
     self.r0 = Redisclient(0)
     self.r1 = Redisclient(1)
     self.r2 = Redisclient(2)
     self.r3 = Redisclient(3)
     self.category_list = []
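
FETCH is described only by the comment above ("request wrapper with a built-in proxy IP pool"); the class itself never appears in these examples. A minimal sketch consistent with how it is called everywhere (fetch(url, headers=..., method='get') returning a requests-style response), with the proxy list and retry count as assumptions:

import random

import requests

class FETCH:
    def __init__(self, proxies=None, retries=3, timeout=10):
        self.proxies = proxies or []  # e.g. ['http://1.2.3.4:8080', ...]
        self.retries = retries
        self.timeout = timeout

    def fetch(self, url, headers=None, method='get'):
        # Try up to `retries` times, rotating through the proxy pool if one is set.
        for _ in range(self.retries):
            proxy = random.choice(self.proxies) if self.proxies else None
            try:
                return requests.request(
                    method, url, headers=headers, timeout=self.timeout,
                    proxies={'http': proxy, 'https': proxy} if proxy else None)
            except requests.RequestException:
                continue
        raise ConnectionError('all retries failed for %s' % url)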
Example #9
    def __init__(self):
        self.start_url = 'http://www.9gk.cc/zp/sichuan/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }

        self.headers_fordata = {

            # ":authority":"www.cbi360.net",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding":
            "gzip,deflate,br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "no-cache",
            "Content-Type":
            "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie":
            "Hm_lvt_ccf8b732d64d55d0d8a73ec2bcd276ab=1612144130,1612399856,1612752316,1613704044; Hm_lpvt_ccf8b732d64d55d0d8a73ec2bcd276ab=1613704100",
            "Connection":
            "keep-alive",
            "Host":
            "www.9gk.cc",
            "pragma":
            "no-cache",
            "Referer":
            "http://www.9gk.cc/zp/p1700",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "9guakao_chengdu")
Example #10
class Gkspider:
    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_name_list = []
        self.sec_category_dict = {}
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent":
            "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 86.0.4240.111Safari / 537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }

    def get_category(self):
        html = self.f.fetch(url=self.starturl,
                            headers=self.headers,
                            method='get')
        # html = requests.get(url=self.starturl, headers=self.headers)
        sleep(random.randint(0, 1))
        res = etree.HTML(html.text)
        # print(html.text)

        # category_url_list = res.xpath('//div[@class="content"]//div//a')
        # # if len(category_url_list) > 19:
        # category_url_list = res.xpath('//div[@class="inner"][1]//ul[1]//a')
        category_url_list = res.xpath(
            '//div[@class="categories"]//ul//li[1]//dd[1]//a')
        for i in category_url_list:
            category_name = i.xpath('./text()')[0]
            category_url = i.xpath('./@href')[0]
            category_url = category_url.replace('m.', 'www.')
            if category_name != "不限":
                self.r0.save_category_url(category_name, category_url)
                self.category_name_list.append(category_name)

    def get_sec_category(self):
        for category_name in self.category_name_list:

            url = self.r0.get_category_url(category_name)
            # html = self.f.fetch(url=url,headers=self.headers,method='get')
            html = requests.get(url=url, headers=self.headers_forpage)
            sleep(random.randint(0, 1))
            res = etree.HTML(html.text)

            sec_category_list = res.xpath('//div[@class="content"]//div//a')
            # sec_category_list = res.xpath('//div[@class="inner"][1]//ul//a')

            for i in sec_category_list:
                sec_category_name = i.xpath('./text()')[0]
                sec_category_url = i.xpath('./@href')[0]
                sec_category_url = sec_category_url.replace('m.', 'www.')
                if sec_category_name != '不限':  # skip the "no limit" link
                    print(sec_category_name)
                    self.r1.save_one_dict(category_name, sec_category_name,
                                          sec_category_url)

    def get_all_page(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)

            for sec_category_name, url in sec_category_list.items():
                # html = self.f.fetch(url=url.decode(),headers=self.headers_forpage,method='get')
                html = requests.get(url=url.decode(),
                                    headers=self.headers_forpage)
                sleep(random.randint(0, 1))
                res = etree.HTML(html.text)
                self.r2.save_page_url(
                    category + ":" + sec_category_name.decode(), url.decode())
                while True:
                    try:
                        next_page = res.xpath(
                            '//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'
                        )[0]
                    except:
                        break
                    if not next_page:
                        break

                    self.r2.save_page_url(
                        category + ":" + sec_category_name.decode(), next_page)
                    html_next = self.f.fetch(url=next_page,
                                             headers=self.headers_forpage,
                                             method='get')
                    # html_next = requests.get(url=next_page, headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html_next.text)

    def get_item_url(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r2.get_page_url(category + ":" +
                                                   sec_category_name.decode())
                        # html = self.f.fetch(url=url, headers=self.headers,method='get')
                        html = requests.get(url=url,
                                            headers=self.headers_forpage)
                        sleep(random.randint(1, 2))
                        res = etree.HTML(html.text)
                    except Exception as e:
                        print('error:', e)
                        break
                    # item_list = res.xpath('//li[@class="Tz"]//child::*/a/@href')
                    item_list = res.xpath(
                        '/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'
                    )

                    for item_url in item_list:
                        # if 'tel' not in item_url:
                        #     url = item_url.replace('m.', 'www.')  # per-item URL
                        if 'http' not in item_url:
                            item_url = 'http://www.80guakao.com/' + item_url
                        self.r3.save_item_url(
                            category + ':' + sec_category_name.decode(),
                            item_url)

    def get_info(self):
        # print(res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"]/text()')[0]) #公司名
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]//a/text()')[0]) #电话
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]/text()')[0])  # 姓名
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r3.get_item_url(category + ":" +
                                                   sec_category_name.decode())

                        html = requests.get(url=url.decode(),
                                            headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                        if html.status_code != 200:
                            html = self.f.fetch(url=url.decode(),
                                                headers=self.headers_forpage,
                                                method='get')
                            sleep(random.randint(0, 1))
                        res = etree.HTML(html.text)

                    except:
                        break
                    item = {}
                    # try:
                    #     company_name = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][1]/text()')[0]
                    # except:
                    try:
                        company_name = res.xpath(
                            '//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()'
                        )[0]

                    except:
                        company_name = 'None'

                    # try:
                    #     contact_people = res.xpath('//ul[@class="attr_info bottom"]//li[2]//span[@class="attrVal"]/text()')[0]
                    #     contact_people = contact_people.replace(r'\xa0\xa0','')
                    #
                    # except:
                    contact_people = res.xpath(
                        '//ul[@class="contacter"]//li//font/text()')[0]

                    # try:
                    #     perf_request = res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')[0]
                    # except:
                    #
                    #     perf_request = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//text()')[0]
                    #

                    # try:
                    #     phone = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//a/text()')[0]
                    #     if phone == []:
                    #         raise  Exception
                    # except:

                    # try:
                    phone_url_re = res.xpath(
                        '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick'
                    )[0]

                    par = re.compile("'.*?'")
                    phone_url = re.findall(par, phone_url_re)[1].replace(
                        "'", "")  # phone-number URL

                    # re.findall always yields str here, so the bytes branch was dead code
                    html = requests.get(url=phone_url,
                                        headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html.text)
                    phone = res.xpath(
                        '//div[@class="number"]//span[@class="num"]/text()')[0]
                    # except:
                    #     phone = "None"

                    item['companyCity'] = '宜昌'
                    item['companyProvince'] = '湖北省'
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''
                    # department ID (string)
                    item['deptId'] = ''
                    # centre ID (string)
                    item['centreId'] = ''
                    # item["first_category"] = category
                    # item["sec_category"] = sec_category_name.decode()
                    item["companyName"] = company_name
                    item["outName"] = contact_people
                    item["resourceRemark"] = category + ":" + sec_category_name.decode()
                    item["companyTel"] = phone.strip()
                    if len(contact_people) == 11:
                        item["companyTel"] = contact_people
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    print(item)
                    self.m.mongo_add(item)

    def test(self):
        url = 'http://www.80guakao.com/shengfen/sc/gonglugongcheng/23988.html'
        html = requests.get(url=url, headers=self.headers_forpage)
        print(html.text)
        res = etree.HTML(html.text)
        # print(res.xpath('//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'))
        # print(res.xpath('//div[@class="content"]//div//a/text()'))
        # print(html.text)
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span/a/@href'))
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()')[0]) #公司名称
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')) #专业要求
        # print(res.xpath('//ul[@class="contacter"]//li//font/text()')[0]) #联系人
        phone_url_re = res.xpath(
            '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]  #电话号码

        print(phone_url_re)
        par = re.compile("'.*?'")
        phone_url = re.findall(par, phone_url_re)[1].replace("'", "")  # phone-number URL
        html = requests.get(url=phone_url, headers=self.headers_forpage)
        res = etree.HTML(html.text)
        phone = res.xpath(
            '//div[@class="number"]//span[@class="num"]/text()')[0]
        print(phone)
        #Request URL: http://www.80guakao.com/box.php?part=seecontact_tel&id=54336&tel_base64=MTk5NTA0NTk5Mjc=
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'))

    def run(self):
        self.get_category()
        self.get_sec_category()
        self.get_all_page()
        self.get_item_url()
        self.get_info()
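
A shortcut worth noting: the request URL recorded in test() ends with tel_base64=MTk5NTA0NTk5Mjc=, and that parameter is just the phone number base64-encoded (it decodes to 19950459927). So the second box.php request in get_info could in principle be replaced by decoding the parameter straight out of phone_url:

import base64
from urllib.parse import parse_qs, urlparse

def phone_from_url(phone_url: str) -> str:
    # e.g. '...box.php?part=seecontact_tel&id=54336&tel_base64=MTk5NTA0NTk5Mjc='
    tel_b64 = parse_qs(urlparse(phone_url).query)['tel_base64'][0]
    return base64.b64decode(tel_b64).decode()  # -> '19950459927'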
Example #11
class JanzhuSpider():
    def __init__(self, start_url, cookie, referer, companyCity,
                 companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {

            # ":authority":"www.cbi360.net",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding":
            "gzip,deflate,br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "no-cache",
            "Content-Type":
            "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie":
            cookie,
            # "Cookie": "",
            "pragma":
            "no-cache",
            "sec-fetch-dest":
            "document",
            "sec-fetch-mode":
            "navigate",
            "sec-fetch-site":
            "same-origin",
            "sec-fetch-user":
            "******",
            "upgrade-insecure-requests":
            "1",
            "Referer":
            referer,
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')

    def parse_next_page(self):
        self.r0.save_page_url(category_name="北京", page_url=self.start_url)
        # html = self.f.fetch(url=self.start_url, headers=self.headers, method='get')
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(2)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page = res.xpath(
                    '//ul[@class="pagination"]//li//a[contains(text(),"下一页")]/@href'
                )
                print(next_page)
                next_page = 'https://www.cbi360.net' + next_page[0]
            except Exception as e:
                print(e)
                print(html.text)
                break
            self.r0.save_page_url(category_name="北京", page_url=next_page)
            self.parse_item(res)
            # html = self.f.fetch(url=next_page, headers=self.headers, method='get')
            html = requests.get(url=next_page, headers=self.headers)
            sleep(1)

    def re_phone(self, target):
        try:
            phone = re.findall(self.par, target)[0]
        except:
            print(target)
            try:
                phone = re.findall(self.par2, target)[0]
            except:
                phone = ''
        return phone

    def parse_item(self, res):
        # //dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]
        # while True:
        # try:
        #     # url = self.r0.get_page_url(category_name='北京')
        #     # html = self.f.fetch(url=url, headers=self.headers, method='get')
        #     # html = requests.get(url=url, headers=self.headers)
        # except:
        #     continue
        sleep(1)
        # res = etree.HTML(html.text)
        companyName_list = res.xpath(
            '//ul[@class="table-con-top clear search-word"]//li[@style]//preceding-sibling::* //a[@target="_blank"]/text()'
        )
        phone_list = res.xpath(
            '//dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]/text()'
        )
        for i in range(len(companyName_list)):
            item = {}
            companyName = companyName_list[i]
            phone = self.re_phone(phone_list[i])
            if is_phone(phone):
                item['companyCity'] = self.companyCity
                item['companyProvince'] = self.companyProvince
                item['code'] = 'BUS_YT_ZZ'
                item['name'] = '资质'
                item['busCode'] = ''
                item['webUrl'] = '无'
                item['orgId'] = ''
                item['deptId'] = ''
                item['centreId'] = ''
                item["companyName"] = companyName
                item["outName"] = ''
                item["resourceRemark"] = ''
                item["companyTel"] = phone
                item["ibossNum"] = None
                item['isDir'] = 0
                item['isShare'] = 0
                item["_id"] = md5encryption(item["companyTel"])
                item["flag"] = 0
                print(item)
                self.m.mongo_add(item)

    def run(self):
        self.parse_next_page()
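
The MongoDB wrapper, md5encryption, and is_phone helpers are shared by most examples but never defined in them. Plausible stand-ins, assuming pymongo underneath and guessing at the phone check (the real implementations may differ):

import hashlib
import re

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

class MongoDB:
    def __init__(self, uri, db_name, collection_name):
        self.coll = MongoClient(uri)[db_name][collection_name]

    def mongo_add(self, item):
        try:
            self.coll.insert_one(item)
        except DuplicateKeyError:
            pass  # _id is the md5 of the phone number, so repeats are expected

def md5encryption(text):
    # Stable _id derived from the phone number.
    return hashlib.md5(str(text).encode('utf-8')).hexdigest()

def is_phone(text):
    # Loose check: 11-digit mobile or dashed landline; an assumption.
    return bool(re.fullmatch(r'1\d{10}|\d{3,4}-\d{7,8}', str(text).strip()))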
Example #12
class Atb_spider:
    def __init__(self):
        # start URL
        self.starturl = 'http://chengdu.atobo.com/'
        # data record template
        self.data_demo = {
            '_id': '',
            'category_name': '',
            'company_name': '',
            'company_phone': '',
            'company_address': ''
        }
        # request wrapper with a built-in proxy IP pool
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', 'BMD_atb_chengdu')
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_list = []

    def parse_category_html(self):
        # Parse the category page for category names and category URLs,
        # storing them in Redis db 0 as strings: key = category name, value = category URL
        html = self.f.fetch(self.starturl)
        response = etree.HTML(html.text)
        category_list = response.xpath(
            '//div[@class="sidebar-category"]/ul//li/p[@class="pcategory_son"]/a'
        )
        for category in category_list:
            category_name = category.xpath('./text()')[0]
            category_url = category.xpath('./@href')
            category_url = 'http:' + category_url[0]
            self.category_list.append(category_name)
            self.r0.save_category_url(category_name, category_url)

    def parse_more_html(self):
        # Grab the "more" page URL for each category,
        # storing it in Redis db 1 as strings: key = category name, value = "more" page URL
        for category_name in self.category_list:
            url = self.r0.get_category_url(category_name)
            html = self.f.fetch(url)
            response = etree.HTML(html.text)
            more_company_url = response.xpath(
                '//div[@class="product-list-more"]/a/@href')[0].split('//')[1]
            self.r1.save_category_url(category_name, more_company_url)
            self.r0.del_r0_item(category_name)

    def parse_all_page(self):
        # Collect the URL of every listing page,
        # keyed by category name, with all page URLs kept as a list
        for category_name in self.category_list:
            first_page_url = self.r1.get_category_url(category_name)
            # url is the first-page URL
            html = self.f.fetch(url=first_page_url)
            response = etree.HTML(html.text)
            self.r2.save_page_url(category_name, first_page_url)
            while True:
                # next-page URL
                try:
                    next_page_url = response.xpath(
                        '//div[@class="pagelist"]//span[@class="page_next page-n"]/a/@href'
                    )[0]
                    if next_page_url:
                        print(next_page_url)
                        self.r2.save_page_url(category_name=category_name,
                                              page_url=next_page_url)
                        # one key (name) mapped to the list of all page URLs
                        html = self.f.fetch(next_page_url)
                        response = etree.HTML(html.text)
                    else:
                        break
                except:
                    break

    def parse_one_url(self):
        # Collect the URL of every company on each page
        for category_name in self.category_list:
            all_page_url_list = self.r2.get_page_url(category_name)
            for one_page_url in all_page_url_list:
                html = self.f.fetch(one_page_url)
                response = etree.HTML(html.text)
                # selector restored from the commented-out draft that followed
                one_url_list = response.xpath(
                    '//li[@class="product_box"]//li[@class="pp_name"]//a[@class="CompanyName"]/@href')
                for one_url in one_url_list:
                    one_url = "http://www.atobo.com/" + one_url
                    # queueing to db 3 is an assumption; the original draft stopped here
                    self.r3.save_item_url(category_name, one_url)

    def run(self):
        self.parse_category_html()
        self.parse_more_html()
        self.parse_all_page()
Example #13
class Spider9():
    def __init__(self):
        self.start_url = 'http://www.9gk.cc/zp/sichuan/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }

        self.headers_fordata = {

            # ":authority":"www.cbi360.net",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding":
            "gzip,deflate,br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "no-cache",
            "Content-Type":
            "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie":
            "Hm_lvt_ccf8b732d64d55d0d8a73ec2bcd276ab=1612144130,1612399856,1612752316,1613704044; Hm_lpvt_ccf8b732d64d55d0d8a73ec2bcd276ab=1613704100",
            "Connection":
            "keep-alive",
            "Host":
            "www.9gk.cc",
            "pragma":
            "no-cache",
            "Referer":
            "http://www.9gk.cc/zp/p1700",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "9guakao_chengdu")

    def get_category_url(self):
        for i in range(14):
            url = self.start_url + "p" + str(1700 + i)
            self.r0.save_page_url("上海", url)

    def get_all_page(self):
        while True:
            try:
                url = self.r0.get_page_url("上海")
            except:
                break
            # subsequent pages follow the /page/2 pattern
            self.r1.save_page_url("上海", url)
            try:
                html = requests.get(url=url, headers=self.headers)
            except:
                break
            print(html.text)
            res = etree.HTML(html.text)
            try:
                last_page = res.xpath(
                    '//ul[@class="pagination"]//li[@class="disable"]//following-sibling::li//a/text()'
                )
                if last_page == []:
                    # take the last link text whole; list() would split it into characters
                    last_page = res.xpath(
                        '//ul[@class="pagination"]//li//a/text()')[-1:]
            except Exception as e:
                print(e)
                break
            for i in range(2, int(last_page[0]) + 1):
                page_url = str(url, "utf-8") + r'/page/{}'.format(i)
                self.r1.save_page_url("上海", page_url)

    def parse_item_url(self):
        #//div[@class="col-xs-12 boxshadow"]//div[@class="col-lg-12 bk-btm-xuxian pad-10"]//div[@class="col-lg-5 pad-left20"]//a/@href
        while True:
            try:
                url = self.r1.get_page_url("上海")
                html = requests.get(url=url, headers=self.headers)
            except Exception as e:
                break

            # print(html.text)
            res = etree.HTML(html.text)
            # item_url_list = res.xpath('//div[@class="col-xs-12  boxshadow "]//div[@class="col-lg-12 bk-btm-xuxian pad-10"]//div[@class="col-lg-5 pad-left20"]//a/@href')
            item_url_list = res.xpath(
                '/html/body/div[5]/div/div/div/span/a/@href')
            for i in range(len(item_url_list)):
                print(item_url_list[i])
                self.r2.save_page_url("上海", item_url_list[i])

    def parse_data(self):
        while True:
            try:
                url = self.r2.get_page_url("上海")
                print(url)
            except:
                break
            headers = self.headers_fordata
            headers["Referer"] = url
            html = requests.get(url=url, headers=headers)
            res = etree.HTML(html.text)
            try:
                outName = res.xpath(
                    '/html/body/div[3]/div[1]/div[2]/div[4]/text()')[0]
                phone = res.xpath(
                    '/html/body/div[3]/div[1]/div[2]/div[6]/span/text()')[0]
                companyName = res.xpath(
                    '/html/body/div[3]/div[1]/div[1]/h2/text()')[0]
            except:
                continue
            if is_phone(phone):
                if "企业管理" not in str(companyName):
                    print(companyName)
                    item = {}
                    item['companyCity'] = "成都"
                    item['companyProvince'] = "四川省"
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''
                    item['deptId'] = ''
                    item['centreId'] = ''
                    item["companyName"] = companyName
                    item["outName"] = outName
                    item["resourceRemark"] = ''
                    item["companyTel"] = str(phone)
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    # item["flag"] = 0
                    print(item)
                    self.m.mongo_add(item)
            else:
                continue

    def run(self):
        self.get_category_url()
        self.get_all_page()
        self.parse_item_url()
        self.parse_data()

    def test(self):
        pass
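
Every spider here shares a Redisclient(db) wrapper. The method names below all come from call sites in these examples; the Redis data types behind them (string, list, hash) are inferred from usage, so treat this redis-py sketch as a reconstruction rather than the original:

import redis

class Redisclient:
    def __init__(self, db):
        self.r = redis.Redis(host='localhost', port=6379, db=db)

    # strings: one URL per category name
    def save_category_url(self, name, url):
        self.r.set(name, url)

    def get_category_url(self, name):
        return self.r.get(name)  # bytes, hence the .decode() calls in the spiders

    def del_r0_item(self, name):
        self.r.delete(name)

    # lists: queues of page URLs and item URLs
    def save_page_url(self, category_name, page_url):
        self.r.rpush(category_name, page_url)

    def get_page_url(self, category_name):
        return self.r.lpop(category_name)

    def save_item_url(self, category_name, url):
        self.r.rpush(category_name, url)

    def get_item_url(self, category_name):
        return self.r.lpop(category_name)

    # hash: sub-category name -> URL
    def save_one_dict(self, name, key, value):
        self.r.hset(name, key, value)

    def get_keys(self, name):
        return self.r.hgetall(name)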
Example #14
class Spider98_zhaoping:
    def __init__(self):
        self.start_url = 'http://www.98pz.com/t59c11s1/1.html'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "98guakao_hz_qz")

    def parse_next_page(self):
        self.r0.save_page_url(category_name='杭州求职', page_url=self.start_url)
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(0.5)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page_url = res.xpath(
                    '//div[@class="pager"]//a[@class="next"]/@href')[0]
            except:
                break
            next_page_url = 'http://www.98pz.com/' + next_page_url
            print(next_page_url)
            self.r0.save_page_url(category_name='杭州求职', page_url=next_page_url)
            html = requests.get(url=next_page_url, headers=self.headers)

    def parse_item_url(self):
        while True:
            url = self.r0.get_page_url(category_name='杭州求职')
            try:
                html = requests.get(url=url, headers=self.headers)
                sleep(0.5)
            except:
                break
            res = etree.HTML(html.text)
            item_url_list = res.xpath('//td[@class="t"]//a[1]')[:-1]
            for one in item_url_list:
                url = one.xpath('./@href')[0]
                self.r1.save_item_url(category_name='杭州求职', url=url)

    def parse_data(self):
        while True:
            item = {}
            url = self.r1.get_item_url(category_name='杭州求职')
            if not url:
                break
            url = url.decode()  # Redis returns bytes
            if 'www' not in url:
                url = 'http://www.98pz.com' + url
            try:
                html = requests.get(url=url, headers=self.headers)
                sleep(0.5)
            except Exception as e:
                print(e)
                continue
            res = etree.HTML(html.text)
            try:
                company_name = res.xpath(
                    '//span[@class="firm-name"]/a/@title')[0]
            except:
                continue
            # try:
            #     info = res.xpath('//li/i[contains(text(),"注册情况:")]/following-sibling::*/text()')[0]
            #     print(info)
            # except:
            #     continue

            contact_people = res.xpath(
                '//li/i[contains(text(),"联 系 人:")]/following-sibling::*/text()'
            )[0]
            print(contact_people)
            try:
                phone_url = res.xpath(
                    '//li/i[contains(text(),"固定电话:")]/following-sibling::*//img/@src'
                )[0]
            except:
                try:
                    phone_url = res.xpath(
                        '//li/i[contains(text(),"手机号码:")]/following-sibling::*//img/@src'
                    )[0]
                except:
                    continue

            resourceMark = res.xpath(
                '//li/i[contains(text(),"职位类型:")]/following-sibling::a//text()'
            )
            resourceMark = resourceMark[0] + resourceMark[1]
            if phone_url == '':
                phone = ''
            else:
                try:
                    phone = self.rec_img(phone_url)
                except:
                    continue

            item['companyCity'] = '杭州'
            item['companyProvince'] = '浙江省'
            item['code'] = 'BUS_YT_ZZ'
            item['name'] = '资质'
            item['busCode'] = ''
            item['webUrl'] = '无'
            item['orgId'] = ''
            item['deptId'] = ''
            item['centreId'] = ''
            item["companyName"] = company_name
            item["outName"] = contact_people
            item["resourceRemark"] = resourceMark
            item["companyTel"] = phone
            item["ibossNum"] = None
            item['isDir'] = 0
            item['isShare'] = 0
            item["_id"] = md5encryption(item["companyTel"])
            print(item)
            self.m.mongo_add(item)

    def rec_img(self, img_url):

        url_b = img_url.split('data:image/gif;base64,')[1]
        url_b = url_b.encode()
        content = base64.b64decode(url_b)
        with open(r'G:\rec_pic\target.jpg', 'wb') as f:
            f.write(content)

        text = pytesseract.image_to_string(
            Image.open(r'G:\rec_pic\target.jpg').convert('RGB'))

        os.remove(r'G:\rec_pic\target.jpg')
        return text

    def test(self):
        self.parse_item_url()

    def run(self):
        self.parse_next_page()
        self.parse_item_url()

        self.parse_data()
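
rec_img above round-trips the decoded image through G:\rec_pic\target.jpg before OCR. pytesseract accepts a PIL image directly, so the temporary file can be skipped; the same step done in memory (a sketch assuming the same Tesseract setup):

import base64
import io

import pytesseract
from PIL import Image

def rec_img_inmemory(img_url: str) -> str:
    # img_url is a data URI: 'data:image/gif;base64,<payload>'
    payload = img_url.split('data:image/gif;base64,')[1]
    img = Image.open(io.BytesIO(base64.b64decode(payload))).convert('RGB')
    return pytesseract.image_to_string(img)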
Example #15
class Logospider:
    def __init__(self):
        self.statrurl = 'https://www.logoids.com/tags/diqu/1/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}
        self.data_demo = {'_id': '', 'category_name': '', 'brand_name': '', 'logo_url': '', }
        self.m = Mongoclient()
        # self.f=FETCH()
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.category_list = []

    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers).content.decode('utf-8', 'ignore')
        # html=self.f.fetch(url)
        return html

    def parse_category_html(self, html):
        # Parse category names and category URLs

        response = etree.HTML(html)

        # //div[@class="guider"]//dl[1]//dd//ul//li//span/text() 种类名
        # //div[@class="guider"]//dl[1]//dd//ul//li//a/@href 种类url
        category_list = response.xpath('//div[@class="guider"]//dl[1]//dd//ul//li')[1:]
        for category in category_list:
            category_name = category.xpath('.//span/text()')
            category_url = category.xpath('.//a/@href')
            # Store in Redis as strings: key = category name, value = URL (one key per URL)
            self.r0.save_category_url(category_name[0], category_url[0])
            # print(self.r0.get_data(category_name[0]))
            # b'https://www.logoids.com/tags/dianqipinpai/'
            self.category_list.append(category_name[0])

    def parse_allpage(self):
        # Collect the URLs of all pages
        for categoryname in self.category_list:
            # Fetch each category URL from Redis using self.category_list

            first_page_url = self.r0.get_category_url(categoryname)
            self.r0.del_r0_item(categoryname)
            html = self.get_html(url=first_page_url)
            response = etree.HTML(html)
            self.r1.save_page_url(category_name=categoryname, page_url=first_page_url)
            while True:
                # next-page URL
                try:
                    next_page_url = \
                        response.xpath('//div[@class="pager"]/span[@class="current"]//following-sibling::a[1]/@href')[0]
                    if next_page_url:
                        # print(next_page_url)
                        self.r1.save_page_url(category_name=categoryname, page_url=next_page_url)
                        # one key (name) mapped to the list of all page URLs
                        html = self.get_html(url=next_page_url)
                        response = etree.HTML(html)
                    else:
                        break
                except:
                    break

    def parse_one(self):
        # Collect the URL of every logo item
        for category in self.category_list:
            while True:
                try:
                    # Pop one page URL by category name; keep going while any remain
                    one_page_url = self.r1.get_page_url(category_name=category)
                    if one_page_url:
                        response = etree.HTML(self.get_html(one_page_url.decode()))
                        # Extract the list of item URLs on this page
                        data_url_list = response.xpath('//ul[@class="list clean"]//li//div[@class="thumb"]/a/@href')
                        for one_data_url in data_url_list:
                            # Store all item URLs as a list keyed by category name
                            self.r2.save_item_url(category_name=category, url=one_data_url)
                    else:
                        break
                except:
                    break

    def parse_data(self):
        # Parse the final data records
        for category in self.category_list:
            while True:
                try:
                    url = self.r2.get_item_url(category_name=category)
                    if url:
                        response = etree.HTML(self.get_html(url.decode()))
                        id = id_encrypte(url)
                        logocategory = category
                        logoname = response.xpath('//ul[@class="info"]//li/h1/text()')
                        logourl = response.xpath('//ul[@class="thumb-list"]//li//a/@href')

                        self.m.save_data(
                            {"_id": id, "logocategory": logocategory, "logoname": logoname[0], "logourl": logourl[0],
                             "request_url": url.decode()})
                    else:
                        break
                except Exception as e:
                    print('error: %s' % e)
                    break

    def run(self):
        html = self.get_html(self.statrurl)
        self.parse_category_html(html)
        self.parse_allpage()
        self.parse_one()
        self.parse_data()
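
parse_data calls an id_encrypte(url) helper that never appears in these examples. Mirroring the md5encryption used by the other spiders, a plausible stand-in (an assumption, not the original):

import hashlib

def id_encrypte(url):
    # Hypothetical: derive a stable _id from the item URL.
    if isinstance(url, bytes):
        url = url.decode()
    return hashlib.md5(url.encode('utf-8')).hexdigest()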
Example #16
class kd8_spider:
    def __init__(self):
        self.starturl = 'http://hangzhou.qd8.com.cn/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}
        self.s = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.item_dict = {}
        self.db = MongoDB('mongodb://localhost', 'cuiworkdb', 'kd8')

    def get_category(self):
        html = requests.get(url=self.starturl, headers=self.headers)
        res = etree.HTML(html.text)
        category_list = res.xpath('//div[@class="nav"]//a')[4:15]  # 除去首页,房产招聘求职
        for i in category_list:
            category_url = i.xpath('./@href')[0]
            category_url = self.starturl + category_url[1:]
            self.r0.save_page_url('一级url', category_url)

    def get_sec_category(self):
        while True:
            try:
                category_url = self.r0.get_page_url('一级url')
            except:
                break
            if category_url:
                html = requests.get(url=category_url, headers=self.headers)
                res = etree.HTML(html.text)
                sec_category_list = res.xpath('//div[@class="jzlianjie"]//a')
                for i in sec_category_list:
                    sec_category_url = i.xpath('./@href')[0]
                    sec_category_url = self.starturl + sec_category_url[1:]
                    if sec_category_url == '':
                        break
                    try:
                        self.r1.save_page_url("二级url", sec_category_url)
                    except:
                        break
            else:
                break

    def get_all_page(self):
        while True:
            try:
                print(1)
                url = self.r1.get_page_url("二级url")
                print(url)
            except:
                break
            self.r2.save_page_url("三级url", url)
            try:
                html = requests.get(url=url, headers=self.headers)
            except:
                break
            res = etree.HTML(html.text)
            while True:
                try:
                    next_page_url = res.xpath('//div[@class="paginator"]//a[contains(text(),"下一页")]/@href')[0]
                except Exception as e:
                    print(e)
                    break
                next_page_url = 'http://hangzhou.qd8.com.cn/' + next_page_url[1:]
                url = next_page_url
                self.r2.save_page_url("三级url", url)
                html = requests.get(url=url, headers=self.headers)
                res = etree.HTML(html.text)

    def get_item_url(self):
        while True:
            try:
                url = self.r2.get_page_url("三级url").decode()
                print(url)
                html = requests.get(url=url, headers=self.headers)
            except:
                break
            res = etree.HTML(html.text)
            item_url_list = res.xpath('//table//tbody//tr//td//h2//a[1]/@href')
            print(item_url_list)
            for item_url in item_url_list:
                print(item_url)
                self.r3.save_page_url('每个信息url', item_url)

    # def parse_data(self):
    #     for k in self.item_dict:
    #         for one_url in self.item_dict[k]:
    #             html = requests.get(url=one_url, headers=self.headers)
    #             res = etree.HTML(html)
    #             item_name = res.xpath('//div[@id="baselist"]//li')[0]
    #             # item_info = res.xpath('//div[@id="fangwu_view_contnet"]//text()')
    #             item_phone = res.xpath('//div[@id="yzlist"]//li[2]/text()')
    #             item = {"item_name": item_name, "item_phone": item_phone}
    #             # self.db.mongo_add(item)
    #             print(item)
    #             # //div[@id="baselist"]//li
    #             # //div[@id="yzlist"]//li

    def huangye(self):
        pass

    def run(self):
        # self.get_category()
        # self.get_sec_category()
        # self.get_all_page()
        self.get_item_url()
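
As written, run() never drains the '每个信息url' queue that get_item_url fills, and the parse_data draft above is commented out. A hedged completion that reads the queue back from Redis db 3 instead of self.item_dict, reusing the XPaths noted in the draft (selectors unverified):

    def parse_data(self):
        while True:
            one_url = self.r3.get_page_url('每个信息url')
            if not one_url:
                break
            html = requests.get(url=one_url.decode(), headers=self.headers)
            res = etree.HTML(html.text)
            item_name = res.xpath('//div[@id="baselist"]//li//text()')
            item_phone = res.xpath('//div[@id="yzlist"]//li[2]/text()')
            item = {'item_name': item_name, 'item_phone': item_phone}
            print(item)
            # self.db.mongo_add(item)  # enable once the record shape is settled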