Example #1
0
def get_real_url(url, try_count=1):
    """Follow *url* and return the final (post-redirect) URL.

    Retries up to 3 times when the request fails or when the request
    ends back on www.offers.com with an HTTP error status; after the
    third attempt the original *url* is returned unchanged.

    :param url: URL to resolve.
    :param try_count: current attempt number (internal; starts at 1).
    :return: the resolved URL, or *url* if resolution kept failing.
    """
    if try_count > 3:
        # Give up after three attempts; fall back to the input URL.
        return url
    try:
        rs = requests.get(url, headers=get_header(), timeout=10)
        # HTTP error statuses are 4xx/5xx, i.e. >= 400; the original
        # `> 400` off-by-one treated 400 Bad Request as success.
        if rs.status_code >= 400 and get_domin_url(rs.url) == 'www.offers.com':
            return get_real_url(url, try_count + 1)
        else:
            return rs.url
    except Exception as e:
        # Best-effort: log and retry on any request failure.
        print(e)
        return get_real_url(url, try_count + 1)
Example #2
0
def get_real_url(url, try_count=1):
    """Resolve *url* to the real target URL, up to 3 attempts.

    Handles three cases: an offers.com error response (retry), a page
    on the same domain that redirects via a JS ``replace('...')`` call
    (extract the target from the HTML), and a plain cross-domain
    redirect (follow it recursively).

    :param url: URL to resolve.
    :param try_count: current attempt number (internal; starts at 1).
    :return: the resolved URL, or *url* if resolution kept failing.
    """
    if try_count > 3:
        return url
    try:
        # NOTE(review): verify=False disables TLS certificate checking
        # — acceptable for a scraper, but worth confirming.
        rs = requests.get(url, headers=get_header(), timeout=10, verify=False)
        # HTTP errors are >= 400; `> 400` missed 400 Bad Request.
        if rs.status_code >= 400 and get_domain_url(rs.url) == 'www.offers.com':
            return get_real_url(url, try_count + 1)
        if get_domain_url(rs.url) == get_domain_url(url):
            # Same domain: look for a JS `replace('...')` redirect
            # embedded in the page body.
            target_url = re.findall(r'replace\(\'(.+?)\'', rs.content.decode())
            if target_url:
                return target_url[0].replace('\\', '') if re.match(r'http', target_url[0]) else rs.url
            else:
                return rs.url
        else:
            # Bug fix: propagate try_count so a redirect loop between
            # two domains cannot recurse forever (the original call
            # restarted the counter at 1 here).
            return get_real_url(rs.url, try_count + 1)
    except Exception as e:
        # Best-effort: log and retry on any request failure.
        print(e)
        return get_real_url(url, try_count + 1)
Example #3
0
 def store_page_parse(self, response):
     """Parse an offers.com store page.

     Yields one CouponItem per non-expired offer strip on the page,
     followed by a single StoreItem describing the store itself.

     :param response: Scrapy response for a store page.
     """
     soup = BeautifulSoup(response.body, 'lxml')

     # --- store fields ---
     store_item = StoreItem()
     store_item['type'] = 'store'
     store_item['logo_url'] = 'https:' + soup.find(
         'div', id='company-identity').a.img.get('src')
     store_item['title'] = soup.find(
         'div', id='offer-section').find('strong').text.strip()
     store_item['name'] = store_item['title']
     store_item['site'] = 'offers'
     store_item['url_name'] = response.url.split('/')[-2]
     store_item['description'] = soup.find(
         'div', id='company-information').find('p').text
     store_item['category'] = soup.find_all(
         'a', itemprop='item')[-1].find('span').text
     store_item['website'] = get_real_url(
         self.base_url +
         soup.find('div', id='company-identity').a.get('href'))
     store_item['country'] = "US"
     # FIXME(review): this stores a scrapy.Field *declaration object*,
     # not a value — almost certainly meant to be a picture URL (e.g.
     # store_item['logo_url']). Kept as-is so downstream pipelines that
     # may already special-case it are not broken; confirm and fix.
     store_item['picture'] = scrapy.Field()
     store_item['coupon_count'] = soup.find(
         'div', id='merchant-stats').find('tr').find('span').text
     store_item['created_at'] = datetime.datetime.now().strftime(
         '%Y-%m-%d %H:%M:%S')
     store_item['final_website'] = get_domin_url(store_item['website'])
     # Log stores whose real website could not be resolved.
     if store_item['final_website'] in ('', None, '#',
                                        'https://www.offers.com'):
         print(store_item['final_website'])

     # --- coupons ---
     for offer in soup.find_all('div', class_='offerstrip'):
         if 'expired' in offer.parent.get('class'):
             continue
         coupon_item = CouponItem()
         coupon_item['type'] = 'coupon'
         coupon_item['name'] = offer.find('h3', class_='name').text.strip()
         coupon_item['site'] = 'offers'
         description = offer.find('div', class_='more-details')
         coupon_item['description'] = description.find(
             'p').text.strip() if description else ""
         try:
             coupon_item['verify'] = 'Y' if offer.find(
                 'span', class_='verified').find(
                     'strong').text == "Verified" else "N"
         except Exception:
             # No "verified" badge on this offer strip.
             coupon_item['verify'] = 'N'
         coupon_item['link'] = self.base_url + offer.find('a').get('href')
         coupon_item['expire_at'] = None
         # Classify the offer as a coupon code vs. a plain deal.
         # Bug fix: coupon_type must be bound even when the badge
         # lookup fails; the original left it undefined in the except
         # branch and the `in` test below raised NameError.
         coupon_type = ''
         try:
             div = offer.find('div', class_='badge-text')
             span = offer.find('span', class_='dolphin flag')
             coupon_type = div.text if div else ''
             coupon_type += span.text if span else ''
         except Exception:
             coupon_type = ''
         if 'code' in coupon_type:
             # Fetch the actual code text from the reveal endpoint.
             data_offer_id = offer.get('data-offer-id')
             long_id = coupon_item['link'].split('/')[-2]
             code_get_url = self.code_url.replace(
                 'code_id', data_offer_id).replace('long_id', long_id)
             res = requests.get(code_get_url, headers=get_header())
             code = re.findall(r'<div class="coupon-code">(.+?)</div>',
                               res.content.decode())
             coupon_item['code'] = code[0] if code else ''
             coupon_item['coupon_type'] = "CODE"
         else:
             coupon_item['coupon_type'] = "DEAL"
             coupon_item['code'] = ''
         # Copy shared store context onto the coupon.
         coupon_item['final_website'] = store_item['final_website']
         coupon_item['store'] = store_item['title']
         coupon_item['store_url_name'] = store_item['url_name']
         coupon_item['store_description'] = store_item['description']
         coupon_item['store_category'] = store_item['category']
         coupon_item['store_website'] = store_item['website']
         coupon_item['store_country'] = "US"
         coupon_item['store_picture'] = store_item['logo_url']
         coupon_item['created_at'] = datetime.datetime.now().strftime(
             '%Y-%m-%d %H:%M:%S')
         coupon_item['status'] = '0'
         yield coupon_item

     yield store_item