def __init__(self, brand):
    # NOTE(review): this def sits at module level, outside any class — it looks
    # like a stray duplicate of PlccenterScraper.__init__; confirm it is dead
    # code and delete it if so.
    """Store the target brand and set up a proxied HTTP session.

    :param brand: brand name used by scrapers to pick the search URL.
    """
    self.brand = brand
    self.proxy_obj = ProxyRotator()
    # rotate_proxy() returns a JSON string such as '{"http": "..."}'
    self.PROXY = json.loads(self.proxy_obj.rotate_proxy())
    print(self.PROXY)
    # Bug fix: the original assigned the `requests` *module* and set
    # `.headers` on it, which requests.get() never reads — the custom
    # headers were silently never sent.  A Session applies them properly.
    self.session = requests.Session()
    self.session.headers = self.HEADERS
class Scraper(object):
    """Base scraper: holds default headers, a rotating proxy, and an HTTP session."""

    # Default browser-like request headers sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/27.0.1453.73 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        # 'Referer': '',
        # 'Origin': '',
        # 'Host': ''
    }

    # PROXY = {'http':'http://*****:*****@38.78.197.196:60099'}
    # Class-level defaults; both are shadowed by instance attributes in
    # __init__.  Kept for backward compatibility with any external reader.
    PROXY = {}
    proxy_obj = ProxyRotator()

    def __init__(self, *args, **kwargs):
        """Create a per-instance proxy rotator and a proxied HTTP session."""
        self.proxy_obj = ProxyRotator()
        # rotate_proxy() returns a JSON string such as '{"http": "..."}'
        self.PROXY = json.loads(self.proxy_obj.rotate_proxy())
        print(self.PROXY)
        # Bug fix: the original assigned the `requests` *module* and set
        # `.headers` on it, which requests.get() never reads — the custom
        # headers were silently never sent.  A Session applies them properly.
        self.session = requests.Session()
        self.session.headers = self.HEADERS

    def clean_results(self, price):
        """Extract the first numeric token from *price* and return it as a float.

        Thousands separators (commas) are stripped; returns 0 when *price*
        contains no digits at all.
        """
        try:
            return float(re.search(r'[\d\.,]+', price).group(0).replace(',', ''))
        except AttributeError:
            # re.search returned None — no numeric token in the string.
            return 0
class PlccenterScraper(object):
    """Scrape product prices from plccenter.com through a rotating proxy."""

    # Default browser-like request headers sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/27.0.1453.73 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        # 'Referer': '',
        # 'Origin': '',
        # 'Host': ''
    }

    # Class-level defaults; both are shadowed by instance attributes in
    # __init__.  Kept for backward compatibility with any external reader.
    PROXY = {}
    proxy_obj = ProxyRotator()

    def __init__(self, brand):
        """Store the target brand and set up a proxied HTTP session.

        :param brand: brand name ('Redlion' triggers the RED LION CONTROLS
            URL; anything else goes through the Reliance/Dodge path).
        """
        self.brand = brand
        self.proxy_obj = ProxyRotator()
        # rotate_proxy() returns a JSON string such as '{"http": "..."}'
        self.PROXY = json.loads(self.proxy_obj.rotate_proxy())
        print(self.PROXY)
        # Bug fix: the original assigned the `requests` *module* and set
        # `.headers` on it, which requests.get() never reads — the custom
        # headers were silently never sent.  A Session applies them properly.
        self.session = requests.Session()
        self.session.headers = self.HEADERS

    def clean_results(self, price):
        """Extract the first numeric token from *price* and return it as a float.

        Thousands separators (commas) are stripped; returns 0 when *price*
        contains no digits at all.
        """
        try:
            return float(re.search(r'[\d\.,]+', price).group(0).replace(',', ''))
        except AttributeError:
            # re.search returned None — no numeric token in the string.
            return 0

    def get_price(self, part_num):
        """Fetch the price for *part_num* from plccenter.com.

        Returns the price as a float, or 0 when the request fails or the
        page carries no price element.
        """
        part_num = part_num.strip()
        if self.brand == 'Redlion':
            url = 'http://www.plccenter.com/en-US/Buy/RED LION CONTROLS/%s' % part_num
        else:
            try:
                # Reliance part numbers are zero-padded to 6 digits.
                part_num = '%06d' % int(part_num)
                url = 'http://www.plccenter.com/en-US/Buy/RELIANCE ELECTRIC/%s' % part_num
            except (ValueError, TypeError):
                # Non-numeric part number — fall through to the DODGE url.
                pass
            # NOTE(review): this unconditionally overwrites the RELIANCE
            # ELECTRIC url built above, making that branch dead — confirm
            # whether this line belongs inside the except block instead.
            url = 'http://www.plccenter.com/en-US/Buy/DODGE/%s' % part_num
        print(self.proxy_obj)
        # Rotate once and reuse the result; the original rotated twice,
        # printing a different proxy than the one actually used.
        data = self.proxy_obj.rotate_proxy()
        print(data)
        self.PROXY = json.loads(data)
        print(url)
        try:
            resp = self.session.get(url, proxies=self.PROXY)
            print(resp.url)
        except Exception as e:
            # Network/proxy failure — best-effort: report and give up.
            print(e)
            return 0
        tree = html.fromstring(resp.content)
        try:
            price = tree.xpath(
                ".//div[contains(@class,'productDetailPriceRowOurPrice')]/text()"
            )[0].strip()
            # Some listings show "... Est. $123.45"; keep the estimate part.
            if 'Est.' in price:
                price = price.split('Est.')[1]
            print(price)
        except IndexError as e:
            # No price element on the page.
            print(e)
            print(url)
            return 0
        return self.clean_results(price)

    def scrap(self, brand):
        # Placeholder — not implemented.
        pass