Example #1
class Scrapy78Pipeline:
    def open_spider(self, spider):
        # executed once when the spider starts
        # spider.hello = "world"  # dynamically attach an attribute to the spider object; the spider module can then read it
        # a good place to open database connections and the like
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao_changsha")

    def process_item(self, item, spider):
        i = {}  # build a fresh document instead of mutating the Scrapy item
        i['companyCity'] = "长沙"
        i['companyProvince'] = "湖南省"
        i['code'] = 'BUS_YT_ZZ'
        i['name'] = '资质'
        i['busCode'] = ''
        i['webUrl'] = '无'
        i['orgId'] = ''
        i['deptId'] = ''
        i['centreId'] = ''
        i["companyName"] = item["companyName"]
        i["outName"] = item["outName"]
        i["resourceRemark"] = item['resourceRemark']
        i["companyTel"] = str(item["companyTel"])
        i["ibossNum"] = None
        i['isDir'] = 0
        i['isShare'] = 0
        i['flag'] = 0
        i["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(i)
        print(i)
        return item
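
All six examples lean on the same MongoDB helper (imported from Func.client in Example #3), whose implementation is not shown anywhere in this listing. A minimal sketch of what it plausibly looks like on top of pymongo; the method names (mongo_add, find_many, find_all) come from the call sites, while the internals, and the del_field method used in Example #3, are assumptions:

# Sketch of the Func.client.MongoDB helper used throughout these examples.
# Method names are taken from the call sites; everything else is assumed.
from pymongo import MongoClient

class MongoDB:
    def __init__(self, host, db_name, collection_name):
        self.client = MongoClient(host)
        self.collection = self.client[db_name][collection_name]

    def mongo_add(self, doc):
        # upsert by _id so re-running a spider does not raise DuplicateKeyError
        self.collection.replace_one({"_id": doc["_id"]}, doc, upsert=True)

    def find_many(self, field, value):
        return self.collection.find({field: value})

    def find_all(self):
        return self.collection.find()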
Example #2
class Guakao555Pipeline(object):
    def open_spider(self, spider):
        # executed once when the spider starts
        # spider.hello = "world"  # dynamically attach an attribute to the spider object; the spider module can then read it
        # a good place to open database connections and the like
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "555guakao_dg")

    def process_item(self, item, spider):
        item['companyCity'] = '杭州'
        item['code'] = 'BUS_YT_ZZ'
        item['name'] = '资质'
        item['busCode'] = ''
        item['webUrl'] = '无'
        item['orgId'] = ''
        item['deptId'] = ''
        item['centreId'] = ''
        item["ibossNum"] = None
        item['isDir'] = 0
        item['isShare'] = 0
        item["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(item)
        return item

    def close_spider(self, spider):
        # executed when the spider closes
        # a good place to close database connections
        pass
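
Neither pipeline runs unless it is registered in the Scrapy project's settings.py. A minimal sketch, assuming the project module is named guakao (the real module name is not shown in these examples):

# settings.py sketch; the module path "guakao.pipelines" is an assumption.
# The integer is the pipeline order: lower values run earlier (range 0-1000).
ITEM_PIPELINES = {
    "guakao.pipelines.Scrapy78Pipeline": 300,
    "guakao.pipelines.Guakao555Pipeline": 400,
}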
Example #3
# Export records with flag=0 from the incremental database into a clues collection for API import
from Func.client import MongoDB
# import API (append the collection name after the trailing "=")  https://dqk.dgg188.cn/api/import/import_data?ip=10.2.1.122:17017&docName=
import requests

m = MongoDB('mongodb://localhost', 'cuiworkdb', 'jianzhutong_hubei')
m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20210129-zhijiazhuang")
all_data = m.find_many("flag", 0)
# copy every flag=0 document into the target collection
for one in all_data:
    m2.mongo_add(one)
m2.del_field()

# import API call (kept commented out)
# dbname='jianzhutong_guangzhou'
# data={"ip": "10.2.1.122:17017","docName":dbname}
# headers={
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "zh-CN,zh;q=0.9",
# "Cache-Control": "no-cache",
# "Connection": "keep-alive",
# "Cookie": "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22172cc1339f4917-0029d8b329026-4353761-2073600-172cc1339f5818%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22172cc1339f4917-0029d8b329026-4353761-2073600-172cc1339f5818%22%7D",
# "Host": "dqk.dgg188.cn",
# "Pragma": "no-cache",
# "Sec-Fetch-Dest": "document",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-Site": "none",
# "Sec-Fetch-User":"******",
# "Upgrade-Insecure-Requests":"1",
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
# }
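
The commented-out block above only prepares the import API call; it never sends it. A minimal sketch of actually firing the request, reusing the parameters from the comments (the endpoint, its query string, and its response shape are copied from the comments above, not verified):

# Sketch only: sends the import request the commented-out block prepares.
# Whether the API also requires the full header/cookie set is an assumption.
import requests

dbname = 'jianzhutong_guangzhou'
resp = requests.get('https://dqk.dgg188.cn/api/import/import_data',
                    params={"ip": "10.2.1.122:17017", "docName": dbname},
                    timeout=30)
print(resp.status_code, resp.text)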
Example #4
class Gkspider:
    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        self.r0 = Redisclient(0)  # category URLs
        self.r1 = Redisclient(1)  # second-level category URLs
        self.r2 = Redisclient(2)  # listing-page URLs
        self.r3 = Redisclient(3)  # item URLs
        self.category_name_list = []
        self.sec_category_dict = {}
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent":
            "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 86.0.4240.111Safari / 537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }

    def get_category(self):
        html = self.f.fetch(url=self.starturl,
                            headers=self.headers,
                            method='get')
        # html = requests.get(url=self.starturl, headers=self.headers)
        sleep(random.randint(0, 1))
        res = etree.HTML(html.text)
        # print(html.text)

        # category_url_list = res.xpath('//div[@class="content"]//div//a')
        # # if len(category_url_list) > 19:
        # category_url_list = res.xpath('//div[@class="inner"][1]//ul[1]//a')
        category_url_list = res.xpath(
            '//div[@class="categories"]//ul//li[1]//dd[1]//a')
        for i in category_url_list:
            category_name = i.xpath('./text()')[0]
            category_url = i.xpath('./@href')[0]
            category_url = category_url.replace('m.', 'www.')
            if category_name != "不限":
                self.r0.save_category_url(category_name, category_url)
                self.category_name_list.append(category_name)

    def get_sec_category(self):
        for category_name in self.category_name_list:

            url = self.r0.get_category_url(category_name)
            # html = self.f.fetch(url=url,headers=self.headers,method='get')
            html = requests.get(url=url, headers=self.headers_forpage)
            sleep(random.randint(0, 1))
            res = etree.HTML(html.text)

            sec_category_list = res.xpath('//div[@class="content"]//div//a')
            # sec_category_list = res.xpath('//div[@class="inner"][1]//ul//a')

            for i in sec_category_list:
                sec_category_name = i.xpath('./text()')[0]
                sec_category_url = i.xpath('./@href')[0]
                sec_category_url = sec_category_url.replace('m.', 'www.')
                if sec_category_name != '不限':  # skip the "no restriction" catch-all link
                    print(sec_category_name)
                    self.r1.save_one_dict(category_name, sec_category_name,
                                          sec_category_url)

    def get_all_page(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)

            for sec_category_name, url in sec_category_list.items():
                # html = self.f.fetch(url=url.decode(),headers=self.headers_forpage,method='get')
                html = requests.get(url=url.decode(),
                                    headers=self.headers_forpage)
                sleep(random.randint(0, 1))
                res = etree.HTML(html.text)
                self.r2.save_page_url(
                    category + ":" + sec_category_name.decode(), url.decode())
                while True:
                    try:
                        next_page = res.xpath(
                            '//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'
                        )[0]
                    except IndexError:
                        # no "下一页" (next page) link: last page reached
                        break
                    if not next_page:
                        break

                    self.r2.save_page_url(
                        category + ":" + sec_category_name.decode(), next_page)
                    html_next = self.f.fetch(url=next_page,
                                             headers=self.headers_forpage,
                                             method='get')
                    # html_next = requests.get(url=next_page, headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html_next.text)

    def get_item_url(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r2.get_page_url(category + ":" +
                                                   sec_category_name.decode())
                        # html = self.f.fetch(url=url, headers=self.headers,method='get')
                        html = requests.get(url=url,
                                            headers=self.headers_forpage)
                        sleep(random.randint(1, 2))
                        res = etree.HTML(html.text)
                    except Exception as e:
                        print('error:', e)
                        break
                    # item_list = res.xpath('//li[@class="Tz"]//child::*/a/@href')
                    item_list = res.xpath(
                        '/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'
                    )

                    for item_url in item_list:
                        # if 'tel' not in item_url:
                        #     url = item_url.replace('m.', 'www.')  # per-item URL
                        if 'http' not in item_url:
                            # make relative links absolute (avoiding a double slash)
                            item_url = 'http://www.80guakao.com/' + item_url.lstrip('/')
                        self.r3.save_item_url(
                            category + ':' + sec_category_name.decode(),
                            item_url)

    def get_info(self):
        # print(res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"]/text()')[0]) #公司名
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]//a/text()')[0]) #电话
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]/text()')[0])  # 姓名
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r3.get_item_url(category + ":" +
                                                   sec_category_name.decode())

                        html = requests.get(url=url.decode(),
                                            headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                        if html.status_code != 200:
                            html = self.f.fetch(url=url.decode(),
                                                headers=self.headers_forpage,
                                                method='get')
                            sleep(random.randint(0, 1))
                        res = etree.HTML(html.text)

                    except Exception:
                        # item-URL queue exhausted or fetch failed
                        break
                    item = {}
                    # try:
                    #     company_name = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][1]/text()')[0]
                    # except:
                    try:
                        company_name = res.xpath(
                            '//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()'
                        )[0]

                    except IndexError:
                        company_name = 'None'

                    # try:
                    #     contact_people = res.xpath('//ul[@class="attr_info bottom"]//li[2]//span[@class="attrVal"]/text()')[0]
                    #     contact_people = contact_people.replace(r'\xa0\xa0','')
                    #
                    # except:
                    contact_people = res.xpath(
                        '//ul[@class="contacter"]//li//font/text()')[0]

                    # try:
                    #     perf_request = res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')[0]
                    # except:
                    #
                    #     perf_request = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//text()')[0]
                    #

                    # try:
                    #     phone = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//a/text()')[0]
                    #     if phone == []:
                    #         raise  Exception
                    # except:

                    # try:
                    phone_url_re = res.xpath(
                        '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick'
                    )[0]

                    par = re.compile("'.*?'")
                    # the second quoted string in the onclick handler is the phone-reveal URL
                    phone_url = re.findall(par, phone_url_re)[1].replace("'", "")

                    if isinstance(phone_url, str):
                        html = requests.get(url=phone_url,
                                            headers=self.headers_forpage)
                    else:
                        html = requests.get(url=phone_url.decode(),
                                            headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html.text)
                    phone = res.xpath(
                        '//div[@class="number"]//span[@class="num"]/text()')[0]
                    # except:
                    #     phone = "None"

                    item['companyCity'] = '宜昌'
                    item['companyProvince'] = '湖北省'
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''
                    # department ID (string)
                    item['deptId'] = ''
                    # centre ID (string)
                    item['centreId'] = ''
                    # item["first_category"] = category
                    # item["sec_category"] = sec_category_name.decode()
                    item["companyName"] = company_name
                    item["outName"] = contact_people
                    item["resourceRemark"] = category + ":" + sec_category_name.decode()
                    item["companyTel"] = phone.strip()
                    if len(contact_people) == 11:
                        # an 11-digit "contact name" is actually a mobile number
                        item["companyTel"] = contact_people
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    print(item)
                    self.m.mongo_add(item)

    def test(self):
        url = 'http://www.80guakao.com/shengfen/sc/gonglugongcheng/23988.html'
        html = requests.get(url=url, headers=self.headers_forpage)
        print(html.text)
        res = etree.HTML(html.text)
        # print(res.xpath('//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'))
        # print(res.xpath('//div[@class="content"]//div//a/text()'))
        # print(html.text)
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span/a/@href'))
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()')[0]) #公司名称
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')) #专业要求
        # print(res.xpath('//ul[@class="contacter"]//li//font/text()')[0]) #联系人
        phone_url_re = res.xpath(
            '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]  # phone-number onclick handler

        print(phone_url_re)
        par = re.compile("'.*?'")
        phone_url = re.findall(par, phone_url_re)[1].replace("'", "")  # phone-number URL
        html = requests.get(url=phone_url, headers=self.headers_forpage)
        res = etree.HTML(html.text)
        phone = res.xpath(
            '//div[@class="number"]//span[@class="num"]/text()')[0]
        print(phone)
        # Request URL: http://www.80guakao.com/box.php?part=seecontact_tel&id=54336&tel_base64=MTk5NTA0NTk5Mjc=  (tel_base64 is the base64-encoded phone number)
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'))

    def run(self):
        self.get_category()
        self.get_sec_category()
        self.get_all_page()
        self.get_item_url()
        self.get_info()
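
Every example keys its documents on md5encryption(item["companyTel"]), so the same phone number always maps to the same _id and repeat listings collapse into one document. The helper itself is not shown; a minimal sketch of what it presumably does with hashlib:

# Sketch of the md5encryption helper used as the Mongo _id everywhere:
# the md5 hex digest of the phone number. The real implementation is assumed.
import hashlib

def md5encryption(text):
    return hashlib.md5(str(text).encode('utf-8')).hexdigest()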
Example #5
class JanzhuSpider:
    def __init__(self, start_url, cookie, referer, companyCity,
                 companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {

            # ":authority":"www.cbi360.net",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding":
            "gzip,deflate,br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "no-cache",
            "Content-Type":
            "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie":
            cookie,
            # "Cookie": "",
            "pragma":
            "no-cache",
            "sec-fetch-dest":
            "document",
            "sec-fetch-mode":
            "navigate",
            "sec-fetch-site":
            "same-origin",
            "sec-fetch-user":
            "******",
            "upgrade-insecure-requests":
            "1",
            "Referer":
            referer,
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        self.par = re.compile(r'\d+-\d+')  # dashed landline, e.g. 010-12345678
        self.par2 = re.compile(r'\d+')  # bare digit-run fallback

    def parse_next_page(self):
        self.r0.save_page_url(category_name="北京", page_url=self.start_url)
        # html = self.f.fetch(url=self.start_url, headers=self.headers, method='get')
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(2)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page = res.xpath(
                    '//ul[@class="pagination"]//li//a[contains(text(),"下一页")]/@href'
                )
                print(next_page)
                # IndexError here means no "下一页" (next page) link: last page reached
                next_page = 'https://www.cbi360.net' + next_page[0]
            except Exception as e:
                print(e)
                print(html.text)
                break
            self.r0.save_page_url(category_name="北京", page_url=next_page)
            self.parse_item(res)
            # html = self.f.fetch(url=next_page, headers=self.headers, method='get')
            html = requests.get(url=next_page, headers=self.headers)
            sleep(1)

    def re_phone(self, target):
        # prefer the dashed landline pattern, fall back to any digit run
        try:
            phone = re.findall(self.par, target)[0]
        except IndexError:
            print(target)
            try:
                phone = re.findall(self.par2, target)[0]
            except IndexError:
                phone = ''
        return phone

    def parse_item(self, res):
        # //dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]
        # while True:
        # try:
        #     # url = self.r0.get_page_url(category_name='北京')
        #     # html = self.f.fetch(url=url, headers=self.headers, method='get')
        #     # html = requests.get(url=url, headers=self.headers)
        # except:
        #     continue
        sleep(1)
        # res = etree.HTML(html.text)
        companyName_list = res.xpath(
            '//ul[@class="table-con-top clear search-word"]//li[@style]//preceding-sibling::* //a[@target="_blank"]/text()'
        )
        phone_list = res.xpath(
            '//dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]/text()'
        )
        for i in range(len(companyName_list)):
            item = {}
            companyName = companyName_list[i]
            phone = self.re_phone(phone_list[i])
            if is_phone(phone):
                item['companyCity'] = self.companyCity
                item['companyProvince'] = self.companyProvince
                item['code'] = 'BUS_YT_ZZ'
                item['name'] = '资质'
                item['busCode'] = ''
                item['webUrl'] = '无'
                item['orgId'] = ''
                item['deptId'] = ''
                item['centreId'] = ''
                item["companyName"] = companyName
                item["outName"] = ''
                item["resourceRemark"] = ''
                item["companyTel"] = phone
                item["ibossNum"] = None
                item['isDir'] = 0
                item['isShare'] = 0
                item["_id"] = md5encryption(item["companyTel"])
                item["flag"] = 0
                print(item)
                self.m.mongo_add(item)

    def run(self):
        self.parse_next_page()
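
parse_item only stores rows that pass is_phone, whose implementation is not shown. A minimal sketch, assuming it accepts 11-digit mainland-China mobile numbers plus the dashed landline format that re_phone can return:

# Sketch of the is_phone validator called in parse_item. The real rules are
# unknown; this accepts 13x-19x mobile numbers and dashed landlines.
import re

def is_phone(number):
    if not number:
        return False
    return bool(re.fullmatch(r'1[3-9]\d{9}', number)
                or re.fullmatch(r'\d{3,4}-\d{7,8}', number))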
Example #6
# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_shengzheng")

# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20201224-4")
# import into clues_resources; only this database can push data onward

# m1.mong_find_many_updata({"companyCity": "成都"}, {"isDir": 0})

# all_data = m1.find_all()
# for i in all_data:
#     print(i)

count = 0
gd_data = m1.find_all()  # m1/m2 are MongoDB helpers created like the commented-out m5 above
for i in gd_data:
    # if count <= 3000:
    m2.mongo_add(i)
    # elif count <= 6000:
    #     m3.mongo_add(i)
    # elif count <= 9000:
    #     m4.mongo_add(i)
    # else:
    #     m5.mongo_add(i)
    # count += 1
# all_data = m3.find_all()

# deduplicate identical company names
# list_data = list(m1.find_all())
# for i in range(len(list_data)):
#    for k in range(i+1, len(list_data)):
#        if list_data[i]["companyName"] == list_data[k]["companyName"]:
#            list_data[k]["companyName"] = "None"
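
The commented-out dedup above compares every pair of documents, which is O(n²). A minimal one-pass sketch with the same behavior (the first occurrence keeps its name, later duplicates become "None"), assuming list_data = list(m1.find_all()) as in the comment:

# One-pass version of the commented-out company-name dedup above.
# Assumes list_data = list(m1.find_all()) and the document shape used
# throughout these examples.
seen = set()
for doc in list_data:
    if doc["companyName"] in seen:
        doc["companyName"] = "None"
    else:
        seen.add(doc["companyName"])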