Exemplo n.º 1
0
    def process_start_built_url(self):
        """Fetch the to8to index page and collect the city links found on it.

        The page is requested through the Abuyun proxy settings and retried
        until an HTTP 200 response is received.

        :return: list of dicts, one per matching anchor, with keys
            ``city`` (the anchor text) and ``city_num`` (the text between
            ``//`` and the first ``.`` of the anchor's ``href``).
        """
        url = 'https://www.to8to.com/index.html'

        while 1:
            try:
                # A timeout keeps a stalled proxy from blocking forever.
                response = requests.get(url, headers={
                    'user-agent': random_useragent()
                }, proxies=AbuyunSpider.returnRequestProxies(), timeout=6)
                response.encoding = response.apparent_encoding
                if response.status_code == 200:
                    break
                print(response)
            except Exception as e:
                print(e)
            # Back off after every failed attempt (exception or non-200) so
            # the site is not hit in a tight loop.
            time.sleep(random.randint(2, 5))

        document = pq(response.text)
        # Raw string avoids the invalid "\." escape; compiled once, not per link.
        pattern = re.compile(r'//(.*?)\.')
        res = []
        # NOTE(review): the selector uses XPath-style "@class" inside a CSS
        # selector - confirm that the installed pyquery accepts it.
        for x in document('div[@class="xzcs_dt"] > a').items():
            match = pattern.search(x.attr('href') or '')
            if match is None:
                # Anchor without a usable href: skip it instead of crashing.
                continue
            res.append({'city': x.text(), 'city_num': match.group(1)})
        return res
Exemplo n.º 2
0
 def process_requesst(self, session, url):
     """GET ``url`` through ``session`` until an HTTP 200 response arrives.

     Each attempt uses a fresh random user-agent, the Abuyun proxy settings
     and a 6 second timeout.

     :param session: object exposing a requests-style ``get`` method.
     :param url: address to fetch.
     :return: tuple ``(session, response)`` where ``response`` has status 200.
     """
     while 1:
         try:
             response = session.get(url, headers={'user-agent': random_useragent()},
                                    proxies=AbuyunSpider.returnRequestProxies(), timeout=6)
             if response.status_code == 200:
                 break
         except Exception as e:
             print(e)
         # Back off after any failed attempt; previously a non-200 response
         # was retried immediately with no delay.
         time.sleep(random.randint(2, 5))
     return session, response
Exemplo n.º 3
0
    def process_request(self, nextPage, meta, item, Referer=None):
        """Request ``nextPage`` until it answers with HTTP 200.

        Before each real request two warm-up requests are made (the URL
        without its last character, and the URL with ``https`` replaced by
        ``http``); their bodies are dumped to ``ss1.html`` / ``ss2.html``.

        :param nextPage: URL of the page to fetch.
        :param meta: dict; its ``firstUrl`` key is overwritten with the
            ``sourceUrl`` value produced by ``returnBuiltHeaders``.
        :param item: passed through to ``returnBuiltHeaders``.
        :param Referer: optional referer URL passed to ``returnBuiltHeaders``.
        :return: tuple ``(response, meta)`` for the first 200 response.
        """
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])

        headers = self.returnBuiltHeaders(path=path_params,
                                          RefererUrl=Referer,
                                          item=item)
        # 'sourceUrl' is bookkeeping data, not an HTTP header: move it to meta.
        meta['firstUrl'] = headers.pop('sourceUrl')

        # Loop-invariant URLs are computed once.
        first_url = nextPage[:-1]
        second_url = nextPage.replace('https', 'http')

        while 1:
            try:
                first_res = requests.get(
                    first_url,
                    headers={'user-agent': random_useragent()},
                    timeout=3)
                # Explicit encoding: with the platform default a page with
                # characters outside the locale codec raised inside this
                # try-block and the loop never reached the real request.
                with open('ss1.html', 'w', encoding='utf-8') as f:
                    f.write(first_res.text)

                second_res = requests.get(
                    second_url,
                    headers={'user-agent': random_useragent()},
                    timeout=3)
                with open('ss2.html', 'w', encoding='utf-8') as f:
                    f.write(second_res.text)

                # NOTE(review): 'reutnRequestsProxies' looks like a misspelling;
                # confirm the method name against the class definition.
                response = requests.get(url=nextPage,
                                        headers=headers,
                                        timeout=3,
                                        allow_redirects=False,
                                        proxies=self.reutnRequestsProxies())
                if response.status_code == 200:
                    return response, meta
                print(response)
            except Exception as e:
                print(e)
            # Back off after any failed attempt, including non-200 responses.
            time.sleep(random.randint(2, 5))
Exemplo n.º 4
0
 def get_cityid(self, url):
     """Fetch ``url`` and return the ``value`` of the element with id ``cityId``.

     The request is retried until an HTTP 200 response is received.

     :param url: page to fetch.
     :return: the ``value`` attribute of ``#cityId`` as returned by pyquery
         (``None`` when the element or attribute is absent).
     """
     while 1:
         print('###############获取城市ID{}'.format(url))
         try:
             # A timeout keeps a stalled proxy from blocking forever.
             response = requests.get(url, headers={
                 'user-agent': random_useragent()
             }, proxies=AbuyunSpider.returnRequestProxies(), timeout=6)
             if response.status_code == 200:
                 break
         except Exception as e:
             print(e)
         # Back off after any failed attempt, including non-200 responses.
         time.sleep(random.randint(2, 5))
     document = pq(response.text)
     return document('#cityId').attr('value')
Exemplo n.º 5
0
    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers for www.chenyang.com.

        :param path: not used by this method.
        :param RefererUrl: optional value for the ``Referer`` header; the
            header is omitted when this is falsy.
        :return: dict of header names to values, with a random user-agent.
        """
        headers = {
            "Accept":
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            "Accept-Encoding": "gzip, deflate",
            # The three values below previously contained stray spaces
            # ("zh - CN", "max - age = 0", "keep - alive"), which are not
            # valid HTTP field values.
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.chenyang.com",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl

        return headers
Exemplo n.º 6
0
    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers for the dulux.com.cn stockist endpoint.

        :param path: not used by this method.
        :param RefererUrl: optional referer; when given it replaces the
            default stockist-page referer.
        :return: dict of header names to values, with a random user-agent.
        """
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "if-none-match": "1591637021-1",
            "referer": "https://www.dulux.com.cn/zh/find-a-stockist",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            # Header names are case-insensitive: drop the default lower-case
            # entry so the dict does not carry both "referer" and "Referer".
            del headers['referer']
            headers['Referer'] = RefererUrl

        return headers
Exemplo n.º 7
0
    def returnBuiltHeaders(self, path, item, RefererUrl=None, page=None):
        """
        Build the request headers (user-agent plus a to8to cookie string).

        :param path: not used by this method.
        :param item: dict; ``city`` and ``city_num`` are required,
            ``sourceUrl`` and ``firstUrl`` are optional, and ``city_type``
            is only read when ``firstUrl`` is missing or empty.
        :param RefererUrl: optional value for the ``Referer`` header.
        :param page: when falsy the cookie's landing page is a fixed
            sz.to8to.com URL; otherwise it is the escaped first URL.
        :return: dict with ``user-agent``, ``cookie``, optionally
            ``Referer``, and an extra ``sourceUrl`` entry holding the
            escaped source URL. ``sourceUrl`` is not an HTTP header and
            should be stripped by the caller before sending.
        """

        def escape(url, scheme_sep, slash):
            # Escape only the scheme separator and the slashes of a URL.
            return url.replace('://', scheme_sep).replace('/', slash)

        city = parse.quote(item['city'])
        city_num = item['city_num']

        if item.get('sourceUrl'):
            sourceUrl = escape(item['sourceUrl'], '%3A%2F%2F', '%2F')
        else:
            sourceUrl = 'https%3A%2F%2F{}.to8to.com%2Fcompany%2F'.format(
                city_num)

        # Fall back to the city's company listing URL. 'city_type' is looked
        # up only here, so items that already carry 'firstUrl' do not need it.
        raw_first_url = item.get('firstUrl')
        if not raw_first_url:
            raw_first_url = 'https://{city_num}.to8to.com/company/{city_type}/'.format(
                city_num=city_num, city_type=item['city_type'])

        # The fallback used to replace ':/' rather than '://', which left one
        # slash behind and produced three "%2F" after the scheme.
        firstUrl = escape(raw_first_url, '%3A%2F%2F', '%2F')
        # Same URL with the percent signs themselves escaped (%25).
        nowpage = escape(raw_first_url, '%253A%252F%252F', '%252F')

        landpage = firstUrl if page else 'https%3A//sz.to8to.com/'

        cookie_template = (
            "uid=CgoKUF61XAWCtJc0A7vvAg==; "
            "to8tocookieid=f982c677f999237b9fe9e5ee3947f2cc806225; "
            "tracker2019session=%7B%22session%22%3A%2217201522ce2109-0c32fb225c0495-14291003-2073600-17201522ce3201%22%7D; "
            "tracker2019jssdkcross=%7B%22distinct_id%22%3A%2217201522ce612f-03e21b7f111ee1-14291003-2073600-17201522ce71e%22%7D; "
            "to8to_sourcepage=; to8to_landtime=1589160062; "
            "to8to_cook=OkOcClPzRWV8ZFJlCIF4Ag==; "
            "to8to_townid=1103; to8to_tcode=sh; "
            "to8to_tname=%E4%B8%8A%E6%B5%B7; "
            "Hm_lvt_dbdd94468cf0ef471455c47f380f58d2=1589160063; "
            "tender_popup_flag=true;"
            " ONEAPM_BI_sessionid=9238.924|1589197648127; "
            "Hm_lpvt_dbdd94468cf0ef471455c47f380f58d2={times}; act=freshen;"
            "to8to_landpage={landpage}; "
            "to8to_tcode={city_num}; to8to_tname={city}; "
            "to8to_cmp_sourceUrl={sourceUrl}; "
            "to8to_cmp_firstUrl={firstUrl}; "
            "to8to_nowpage={nowpage}; ")

        # Only user-agent and cookie are sent; the browser-style accept /
        # sec-fetch headers were already commented out in the original code.
        headers = {
            "user-agent": random_useragent(),
            "cookie": cookie_template.format(city=city,
                                             city_num=city_num,
                                             sourceUrl=sourceUrl,
                                             firstUrl=firstUrl,
                                             nowpage=nowpage,
                                             landpage=landpage,
                                             times=now_to_timestamp()),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl
        headers['sourceUrl'] = sourceUrl
        return headers