def get_real_url(url, try_count=1):
    """Follow *url* to its final resolved URL, retrying up to 3 times.

    A retry happens when the request raises, or when the response is an
    error page (status > 400) still served from www.offers.com. After
    the third attempt the original *url* is returned unchanged.

    NOTE(review): a later definition of ``get_real_url`` in this file
    shadows this one at import time.
    """
    if try_count > 3:
        # Give up: hand back the input URL untouched.
        return url
    try:
        resp = requests.get(url, headers=get_header(), timeout=10)
        still_blocked = (resp.status_code > 400
                         and get_domin_url(resp.url) == 'www.offers.com')
        if still_blocked:
            return get_real_url(url, try_count + 1)
        return resp.url
    except Exception as err:
        print(err)
        return get_real_url(url, try_count + 1)
def get_real_url(url, try_count=1):
    """Resolve *url* to the real destination URL, retrying up to 3 times.

    Behaviour:
      * status > 400 while still on www.offers.com -> retry the same URL;
      * response stayed on the same domain as *url* -> try to extract a
        JS ``...replace('<target>')`` redirect target from the body;
      * otherwise -> follow the landed URL recursively.

    After three attempts the original *url* is returned unchanged.
    """
    if try_count > 3:
        return url
    try:
        # NOTE(review): verify=False disables TLS certificate checking —
        # confirm this is intentional (MITM risk on untrusted networks).
        rs = requests.get(url, headers=get_header(), timeout=10, verify=False)
        if rs.status_code > 400 and get_domain_url(rs.url) == 'www.offers.com':
            return get_real_url(url, try_count + 1)
        if get_domain_url(rs.url) == get_domain_url(url):
            # Page did not redirect; look for a client-side JS redirect of
            # the form  ...replace('https://target...')  in the body.
            target_url = re.findall(r'replace\(\'(.+?)\'', rs.content.decode())
            if target_url:
                # Unescape backslashes; only trust the match if it is an
                # absolute http(s) URL, else fall back to the landed URL.
                return (target_url[0].replace('\\', '')
                        if re.match(r'http', target_url[0]) else rs.url)
            else:
                return rs.url
        else:
            # BUG FIX: the original called get_real_url(rs.url) without
            # try_count, resetting the retry budget on every hop — a
            # redirect loop would recurse until RecursionError. Propagate
            # the counter so recursion is bounded.
            return get_real_url(rs.url, try_count + 1)
    except Exception as e:
        print(e)
        return get_real_url(url, try_count + 1)
def store_page_parse(self, response):
    """Parse a store detail page.

    Yields one CouponItem per non-expired offer strip, then the
    StoreItem describing the store itself.
    """
    soup = BeautifulSoup(response.body, 'lxml')
    created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # --- store fields -----------------------------------------------------
    store_item = StoreItem()
    store_item['type'] = 'store'
    store_item['logo_url'] = 'https:' + soup.find(
        'div', id='company-identity').a.img.get('src')
    store_item['title'] = soup.find(
        'div', id='offer-section').find('strong').text.strip()
    store_item['name'] = store_item['title']
    store_item['site'] = 'offers'
    # URL shape .../store-slug/ -> slug is the second-to-last segment.
    store_item['url_name'] = response.url.split('/')[-2]
    store_item['description'] = soup.find(
        'div', id='company-information').find('p').text
    store_item['category'] = soup.find_all(
        'a', itemprop='item')[-1].find('span').text
    store_item['website'] = get_real_url(
        self.base_url + soup.find('div', id='company-identity').a.get('href'))
    store_item['country'] = "US"
    # BUG FIX: the original assigned ``scrapy.Field()`` here — a Field
    # *descriptor instance*, not data. Use the logo URL, consistent with
    # what each coupon records as 'store_picture'.
    store_item['picture'] = store_item['logo_url']
    store_item['coupon_count'] = soup.find(
        'div', id='merchant-stats').find('tr').find('span').text
    store_item['created_at'] = created_at
    store_item['final_website'] = get_domin_url(store_item['website'])
    if store_item['final_website'] in ('', None, '#', 'https://www.offers.com'):
        # NOTE(review): this branch only logs; it looks like it was meant
        # to skip/flag the store — confirm before changing control flow.
        print(store_item['final_website'])

    # --- coupons ----------------------------------------------------------
    for offer in soup.find_all('div', class_='offerstrip'):
        if 'expired' in offer.parent.get('class'):
            continue  # skip expired offers entirely
        yield self._build_coupon_item(offer, store_item, created_at)
    yield store_item

def _build_coupon_item(self, offer, store_item, created_at):
    """Build a CouponItem from one non-expired offer strip element."""
    coupon_item = CouponItem()
    coupon_item['type'] = 'coupon'
    coupon_item['name'] = offer.find('h3', class_='name').text.strip()
    coupon_item['site'] = 'offers'
    description = offer.find('div', class_='more-details')
    coupon_item['description'] = (
        description.find('p').text.strip() if description else "")
    try:
        verified = offer.find('span', class_='verified').find('strong').text
        coupon_item['verify'] = 'Y' if verified == "Verified" else "N"
    except AttributeError:
        # .find() returned None -> no "verified" badge on this offer.
        coupon_item['verify'] = 'N'
    coupon_item['link'] = self.base_url + offer.find('a').get('href')
    coupon_item['expire_at'] = None

    # BUG FIX: initialize coupon_type before the try block. In the
    # original, an exception left ``coupon_type`` unbound, so the
    # membership test below raised NameError and 'code' was never set.
    coupon_type = ''
    try:
        div = offer.find('div', class_='badge-text')
        span = offer.find('span', class_='dolphin flag')
        coupon_type = div.text if div else ''
        coupon_type += span.text if span else ''
    except Exception as e:
        print(e)

    if 'code' in coupon_type:
        # Fetch the actual coupon code from the dedicated endpoint.
        # NOTE(review): blocking requests.get inside a Scrapy callback
        # stalls the reactor; consider a chained scrapy.Request instead.
        data_offer_id = offer.get('data-offer-id')
        long_id = coupon_item['link'].split('/')[-2]
        code_get_url = self.code_url.replace(
            'code_id', data_offer_id).replace('long_id', long_id)
        res = requests.get(code_get_url, headers=get_header())
        code = re.findall(r'<div class="coupon-code">(.+?)</div>',
                          res.content.decode())
        coupon_item['code'] = code[0] if code else ''
        coupon_item['coupon_type'] = "CODE"
    else:
        coupon_item['coupon_type'] = "DEAL"
        coupon_item['code'] = ''

    # Denormalized copies of the parent store's fields.
    coupon_item['final_website'] = store_item['final_website']
    coupon_item['store'] = store_item['title']
    coupon_item['store_url_name'] = store_item['url_name']
    coupon_item['store_description'] = store_item['description']
    coupon_item['store_category'] = store_item['category']
    coupon_item['store_website'] = store_item['website']
    coupon_item['store_country'] = "US"
    coupon_item['store_picture'] = store_item['logo_url']
    coupon_item['created_at'] = created_at
    coupon_item['status'] = '0'
    return coupon_item