Example #1
    def process_start_built_url(self):
        url = 'https://www.to8to.com/index.html'

        while 1:
            try:
                response = requests.get(url, headers={
                    'user-agent': random_useragent()
                }, proxies=AbuyunSpider.returnRequestProxies())
                response.encoding = response.apparent_encoding
                if response.status_code == 200:
                    break
            except Exception as e:
                print(e)
                time.sleep(random.randint(2, 5))

        # with open('ddddd.html','w') as f:
        #     f.write(response.text)
        document = pq(response.text)
        res = []
        for x in document('div.xzcs_dt > a').items():  # CSS class selector (pyquery expects CSS, not XPath)
            # print(x)
            item = {}
            pattern = re.compile(r'//(.*?)\.')  # capture the subdomain between '//' and the first '.'
            item['city'] = x.text()
            item['city_num'] = re.search(pattern, x.attr('href')).group(1)

            res.append(item)
        return res
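
Example #1 depends on two project helpers that are not shown here: random_useragent() and AbuyunSpider.returnRequestProxies(). A minimal sketch of what such helpers might look like; the user-agent strings and the proxy host/credentials below are placeholders, not the project's real values:

import random


USER_AGENTS = [
    # Placeholder strings; a real project would maintain a longer, current list.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
]


def random_useragent():
    """Return a random User-Agent string for each outgoing request."""
    return random.choice(USER_AGENTS)


class AbuyunSpider(object):

    @staticmethod
    def returnRequestProxies():
        """Build a requests-style proxies dict for an HTTP tunnel proxy.

        Host, port and credentials here are placeholders (assumption).
        """
        proxy = 'http://PROXY_USER:PROXY_PASS@http-dyn.abuyun.com:9020'
        return {'http': proxy, 'https': proxy}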
Example #2
    def process_request(self, session, url):
        # Retry until the request succeeds with a 200 response.
        while 1:
            try:
                response = session.get(url, headers={'user-agent': random_useragent()},
                                       proxies=AbuyunSpider.returnRequestProxies(), timeout=6)
                if response.status_code == 200:
                    break
            except Exception as e:
                print(e)
                time.sleep(random.randint(2, 5))
        return session, response
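
The retry loops in Examples #1–#3 run until a 200 response arrives, so a permanently dead URL would block forever. A bounded-retry variant is sketched below; max_retries, the timeout and the backoff range are assumptions, not values from the original code:

import random
import time

import requests


def get_with_retries(url, headers=None, proxies=None, max_retries=5, timeout=6):
    """GET a URL, retrying on errors or non-200 responses up to max_retries times."""
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException as exc:
            print(exc)
        time.sleep(random.randint(2, 5))
    return None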
Example #3
    def get_cityid(self, url):
        while 1:
            print('############### Fetching city ID from {}'.format(url))
            try:
                response = requests.get(url, headers={
                    'user-agent': random_useragent()
                }, proxies=AbuyunSpider.returnRequestProxies())
                if response.status_code == 200:
                    break
            except Exception as e:
                print(e)
                time.sleep(random.randint(2, 5))
        document = pq(response.text)
        cityID = document('#cityId').attr('value')
        return cityID
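
get_cityid assumes the fetched page carries a hidden input with id cityId; a small self-contained check of that extraction with pyquery (the HTML below is fabricated for illustration):

from pyquery import PyQuery as pq

html = '<html><body><input type="hidden" id="cityId" value="12345"></body></html>'
document = pq(html)
print(document('#cityId').attr('value'))  # prints: 12345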
Example #4
    def returnRequestsProxies(self):
        """
        :return: the proxies dict for requests
        """
        return AbuyunSpider.returnRequestProxies()
Example #5
class chenYangSpider(object):

    start_urls = "http://www.chenyang.com/index.php?m=content&c=index&a=lists&catid=27&p={p}&city={city}"
    next_urls = "http://www.chenyang.com/index.php?m=content&c=index&a=lists&catid=27&p={p}&city={city}&page={page}"
    built_url = "http://www.chenyang.com/"
    returnRequestsProxies = AbuyunSpider.returnRequestProxies()

    def __init__(self, setting=Setting()):
        self.db_name = setting.mysql_table_jinpin_dbname

    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers.
        :return:
        """
        headers = {
            "Accept":
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh - CN, zh;q = 0.9",
            "Cache-Control": "max - age = 0",
            "Connection": "keep - alive",
            "Host": "www.chenyang.com",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl

        return headers

    def start_requests(self):

        citys = address_json.address
        for i in citys:
            pro = i['p']
            # print(i)
            p = parse.quote(i['p'])
            for c in i['city']:
                cit = c
                # if '北京' in i['p']:
                c = parse.quote(c)
                page = 1
                url = self.start_urls.format(p=p, city=c)
                response = self.process_request(nextPage=url)
                if response:
                    self.parse(response,
                               meta={
                                   'page': 1,
                                   'p': p,
                                   "c": c,
                                   'pro': pro,
                                   'cit': cit
                               })

    # TODO: issue the GET request for a URL, retrying until it succeeds
    def process_request(self, nextPage, meta=None, Referer=None):
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])
        count = 0
        while 1:
            try:
                response = requests.get(url=nextPage,
                                        headers=self.returnBuiltHeaders(
                                            path=path_params,
                                            RefererUrl=Referer),
                                        timeout=3,
                                        allow_redirects=False,
                                        proxies=self.returnRequestsProxies)
                if response.status_code == 200:
                    print('######### Fetched URL successfully: {} ########'.format(response.url))
                    return response
                else:
                    print(nextPage)
                    print(response)
                    if count > 20:
                        return False
                    count += 1
            except Exception as e:
                print(e)
                print(nextPage)
                time.sleep(random.randint(2, 5))

    def parse(self, response, meta):
        # with open('chengyang.html','w') as f:
        #     f.write(response.text)
        document_pq = pq(response.text)
        value = document_pq('.list-zmd > li')
        if value:
            pro = meta['pro']
            city = meta['cit']
            io_data = {self.db_name: []}
            for i in value.items():
                name = i('div').eq(0)('p > strong').text()
                address = i('div').eq(1)('p').text()
                item = {}
                item['name'] = name
                item['address'] = address
                item['province'] = pro
                item['city'] = city
                item['area'] = ""
                item['numbers'] = ""
                item['telphone'] = ""
                item['types'] = 3
                print(item)
                io_data[self.db_name].append(item)
            server = Pipline_to_redis_server()
            server.sadd(io_data)
            if document_pq('.pages > li'):
                len_num = len(document_pq('.pages > li'))
                result_li = document_pq('.pages > li').eq(len_num - 1)('a').attr('href')
                pattern = re.compile('page=(.*?)&')
                page_li = re.search(pattern, result_li).group()
                try:
                    page_url = re.search(pattern, response.url).group()
                except Exception as e:
                    page_url = ''
                if page_li == page_url:
                    result_li = ''
                page = meta['page']
                page += 1
                c = meta['c']
                p = meta['p']
                if result_li:
                    url = self.built_url + result_li
                    response = self.process_request(nextPage=url)
                    if response:
                        self.parse(response=response,
                                   meta={
                                       'page': page,
                                       'p': p,
                                       "c": c,
                                       'pro': pro,
                                       'cit': city
                                   })
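
The spider classes in Examples #5–#7 all hand their items to Pipline_to_redis_server().sadd(io_data), which is not shown. One plausible shape for that pipeline, assuming a redis-py backend; the key layout and connection parameters are guesses, not the project's real configuration:

import json

import redis


class Pipline_to_redis_server(object):
    """Push scraped items into Redis sets, one set per table name (sketch)."""

    def __init__(self, host='127.0.0.1', port=6379, db=0):
        # Connection parameters are assumptions, not the project's real config.
        self.client = redis.StrictRedis(host=host, port=port, db=db)

    def sadd(self, io_data):
        # io_data has the shape {table_name: [item_dict, item_dict, ...]}.
        for key, items in io_data.items():
            for item in items:
                self.client.sadd(key, json.dumps(item, ensure_ascii=False))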
Example #6
class DuoleshiSpider(object):
    name = 'duoleshi'
    allowed_domains = ['www.dulux.com.cn']
    # start_urls = "https://www.dulux.com.cn/find/store-ajax?address={address}&attributes=&language=zh&pagenum={page}"
    start_urls = "https://www.dulux.com.cn/ajax/stores-api/select/all-id?flds=id,latitude,longitude,companyName,companyName_zh,address,address_zh,city,city_zh,zipcode,zipcode_zh,attributeCodes,brands,region,region_zh,phone,phone_zh,district,district_zh,country,countryCode_zh,country_zh"
    returnRequestsProxies = AbuyunSpider.returnRequestProxies()

    def __init__(self,setting=Setting()):
        self.db_name = setting.mysql_table_jinpin_dbname

    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers.
        :return:
        """
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "if-none-match": "1591637021-1",
            "referer": "https://www.dulux.com.cn/zh/find-a-stockist",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl

        return headers

    # TODO: issue the GET request for a URL, retrying until it succeeds
    def process_request(self, nextPage, meta=None, Referer=None):
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])
        count = 0
        while 1:
            try:
                response = requests.get(url=nextPage,
                                        headers=self.returnBuiltHeaders(path=path_params),
                                        timeout=3, allow_redirects=False, proxies=self.returnRequestsProxies)
                if response.status_code == 200:
                    print('######### Fetched URL successfully: {} ########'.format(response.url))
                    return response
                else:
                    print(nextPage)
                    print(response)
                    if count > 20:
                        return False
                    count += 1
            except Exception as e:
                print(e)
                print(nextPage)
                time.sleep(random.randint(2, 5))

    def start_requests(self):
        url = self.start_urls
        response = self.process_request(nextPage=url)
        if response:
            self.parse(response)
        # yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        result = json.loads(response.text)['response']['docs']
        print('#################### Scraped {} records from the Dulux site ##########'.format(len(result)))
        import re
        if result:
            io_data = {self.db_name: []}
            for index,i in enumerate(result):
                # print(index)
                try:
                    name = i['companyName_zh']
                    pattern_params = re.compile(r'(.*?){}|(\s)+省'.format(i['city_zh']))
                    try:
                        region = re.search(pattern_params, i['address_zh']).group(1)  # province
                    except Exception as e:
                        print(e)
                        region = ''
                    city = i['city_zh']  # city
                    subtitle = ''  # authorization number
                    address = i['address_zh']
                    phone = i.get('phone_zh', '')
                    item = {}
                    item['name'] = name
                    item['address'] = address
                    item['province'] = region
                    item['city'] = city
                    item['area'] = ""
                    item['numbers'] = subtitle
                    item['telphone'] = phone
                    item['types'] = 2
                    io_data[self.db_name].append(item)
                except Exception as e:
                    print(e)

            server = Pipline_to_redis_server()
            server.sadd(io_data)
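
The province extraction in DuoleshiSpider.parse relies on a regex that captures everything before the city name in address_zh. A quick self-contained check of that behaviour with a made-up city and address:

import re

city = '杭州市'                      # fabricated city value
address = '浙江省杭州市西湖区某某路1号'  # fabricated address
pattern = re.compile(r'(.*?){}|(\s)+省'.format(city))
match = re.search(pattern, address)
print(match.group(1) if match else '')  # prints: 浙江省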
Example #7
class HuarunqiSpider(object):
    name = 'huarunqi'
    allowed_domains = ['www.huarun.com']
    start_urls = "http://www.huarun.com/service/store/search?province={}&city=&area=&address=&design=off&page=1&limit=10000"
    returnRequestsProxies = AbuyunSpider.returnRequestProxies()

    def __init__(self, setting=Setting()):
        self.db_name = setting.mysql_table_jinpin_dbname

    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers.
        :return:
        """
        headers = {
            "Accept":
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh - CN, zh;q = 0.9",
            "Cache-Control": "max - age = 0",
            "Connection": "keep - alive",
            "Host": "www.chenyang.com",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl

        return headers

    # TODO: issue the GET request for a URL, retrying until it succeeds
    def process_request(self, nextPage, meta=None, Referer=None):
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])
        count = 0
        while 1:
            try:
                response = requests.get(url=nextPage,
                                        headers=self.returnBuiltHeaders(
                                            path=path_params,
                                            RefererUrl=Referer),
                                        timeout=3,
                                        allow_redirects=False,
                                        proxies=self.returnRequestsProxies)
                if response.status_code == 200:
                    print('######### Fetched URL successfully: {} ########'.format(response.url))
                    return response
                else:
                    print(nextPage)
                    print(response)
                    if count > 20:
                        return False
                    count += 1
            except Exception as e:
                print(e)
                print(nextPage)
                time.sleep(random.randint(2, 5))

    def start_requests(self):
        citys = address_json.address
        for i in citys:
            cit = parse.quote(i['p'])
            url = self.start_urls.format(cit)
            response = self.process_request(nextPage=url)
            if response:
                self.parse(response)
            # yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        result = json.loads(response.text)
        if 'datas' in result['result']:
            io_data = {self.db_name: []}
            for j in result['result']['datas']:
                name = j['name']
                address = j['address']
                province = j['province']
                city = j['city']
                area = j['area']
                numbers = j['numbers']
                telphone = j['telphone']
                item = {}
                item['name'] = name
                item['address'] = address
                item['province'] = province
                item['city'] = city
                item['area'] = area
                item['numbers'] = numbers
                item['telphone'] = telphone
                item['types'] = 1
                io_data[self.db_name].append(item)
            print('############ Storing into redis: {} ##############'.format(io_data))
            # yield io_data
            print(io_data)
            server = Pipline_to_redis_server()
            server.sadd(io_data)
        else:
            print('######################## No results ########', result)
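
A hypothetical driver that runs one of these spiders end to end; it assumes the spider class and the helper modules used in the snippets above are importable in the current context:

if __name__ == '__main__':
    # Assumption: HuarunqiSpider and its dependencies are available here.
    spider = HuarunqiSpider()
    spider.start_requests()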